
Commit 247b201

Merge pull request #93 from klockla/listurl_github
Add method ListURLs to list all URLs known in the frontier with their next fetch date
2 parents: 1d99fab + ea9090e

14 files changed: +24068 -19989 lines

API/src/main/java/crawlercommons/urlfrontier/URLFrontierGrpc.java

Lines changed: 2102 additions & 1383 deletions
Large diffs are not rendered by default.

API/src/main/java/crawlercommons/urlfrontier/Urlfrontier.java

Lines changed: 21129 additions & 18579 deletions
Some generated files are not rendered by default.

API/urlfrontier.proto

Lines changed: 18 additions & 0 deletions
@@ -78,6 +78,12 @@ service URLFrontier {
     Used to check current status of an URL within the frontier
     **/
     rpc GetURLStatus(URLStatusRequest) returns (URLItem) {}
+
+    /** List all URLs currently in the frontier
+    This does not take into account URL scheduling.
+    Used to check current status of all URLs within the frontier
+    **/
+    rpc ListURLs(ListUrlParams) returns (stream URLItem) {}
 }
 
 /**
@@ -306,3 +312,15 @@ message URLStatusRequest {
     string crawlID = 3;
 }
 
+message ListUrlParams {
+    // position of the first result in the list; defaults to 0
+    uint32 start = 1;
+    // max number of values; defaults to 100
+    uint32 size = 2;
+    /** ID for the queue **/
+    string key = 3;
+    // crawl ID
+    string crawlID = 4;
+    // only for the current local instance
+    bool local = 5;
+}
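
For illustration, the new RPC can be exercised from Java through the generated blocking stub, much as the ListURLs client command added in this commit does. A minimal sketch, assuming a frontier listening on localhost:7071; the host, port, crawl ID and page size below are placeholder values:

// Minimal sketch: stream URLs of the DEFAULT crawl, 500 at a time from offset 0.
// Host/port are placeholders; error handling omitted.
import crawlercommons.urlfrontier.URLFrontierGrpc;
import crawlercommons.urlfrontier.Urlfrontier.ListUrlParams;
import crawlercommons.urlfrontier.Urlfrontier.URLItem;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import java.util.Iterator;

public class ListURLsExample {
    public static void main(String[] args) {
        ManagedChannel channel =
                ManagedChannelBuilder.forAddress("localhost", 7071).usePlaintext().build();
        URLFrontierGrpc.URLFrontierBlockingStub stub = URLFrontierGrpc.newBlockingStub(channel);

        ListUrlParams params =
                ListUrlParams.newBuilder().setCrawlID("DEFAULT").setStart(0).setSize(500).build();

        // The server streams URLItem messages; each known item carries its next fetch date.
        Iterator<URLItem> it = stub.listURLs(params);
        while (it.hasNext()) {
            URLItem item = it.next();
            System.out.println(
                    item.getKnown().getInfo().getUrl()
                            + " -> "
                            + item.getKnown().getRefetchableFromDate());
        }
        channel.shutdownNow();
    }
}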

client/src/main/java/crawlercommons/urlfrontier/client/Client.java

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
     ListNodes.class,
     ListQueues.class,
     ListCrawls.class,
+    ListURLs.class,
     GetStats.class,
     PutURLs.class,
     GetURLs.class,
client/src/main/java/crawlercommons/urlfrontier/client/ListURLs.java

Lines changed: 162 additions & 0 deletions
// SPDX-FileCopyrightText: 2020 Crawler-commons
// SPDX-License-Identifier: Apache-2.0

package crawlercommons.urlfrontier.client;

import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.JsonFormat;
import com.google.protobuf.util.JsonFormat.Printer;
import crawlercommons.urlfrontier.URLFrontierGrpc;
import crawlercommons.urlfrontier.URLFrontierGrpc.URLFrontierBlockingStub;
import crawlercommons.urlfrontier.Urlfrontier.ListUrlParams.Builder;
import crawlercommons.urlfrontier.Urlfrontier.URLItem;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.Iterator;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import picocli.CommandLine.ParentCommand;

@Command(
        name = "ListURLs",
        description = "Prints out all URLs in the Frontier",
        sortOptions = false)
public class ListURLs implements Runnable {

    @ParentCommand private Client parent;

    @Option(
            names = {"-n", "--number_urls"},
            defaultValue = "100",
            paramLabel = "NUM",
            description = "maximum number of URLs to return (default 100)")
    private int maxNumURLs;

    @Option(
            names = {"-s", "--start"},
            defaultValue = "0",
            paramLabel = "NUM",
            description = "starting position of URL to return (default 0)")
    private int start;

    @Option(
            names = {"-k", "--key"},
            required = false,
            paramLabel = "STRING",
            description = "key to use to target a specific queue")
    private String key;

    @Option(
            names = {"-o", "--output"},
            defaultValue = "",
            paramLabel = "STRING",
            description = "output file to dump all the URLs")
    private String output;

    @Option(
            names = {"-c", "--crawlID"},
            defaultValue = "DEFAULT",
            paramLabel = "STRING",
            description = "crawl to get the queues for")
    private String crawl;

    @Option(
            names = {"-l", "--local"},
            defaultValue = "false",
            paramLabel = "BOOLEAN",
            description =
                    "restricts the scope to this frontier instance instead of aggregating over the cluster")
    private Boolean local;

    @Option(
            names = {"-j", "--json"},
            defaultValue = "false",
            paramLabel = "BOOLEAN",
            description = "Outputs in JSON format")
    private Boolean json;

    @Option(
            names = {"-p", "--parsedate"},
            defaultValue = "false",
            description = {
                "Print the refetch date in local time zone",
                "By default, time is UTC seconds since the Unix epoch",
                "Ignored if JSON output is selected"
            })
    private boolean parse;

    // Use the system default time zone
    private ZoneId zoneId = ZoneId.systemDefault();

    @Override
    public void run() {

        Builder builder = crawlercommons.urlfrontier.Urlfrontier.ListUrlParams.newBuilder();
        builder.setLocal(local);
        if (key != null) {
            builder.setKey(key);
        }
        builder.setSize(maxNumURLs);
        builder.setStart(start);
        builder.setCrawlID(crawl);

        PrintStream outstream = null;
        if (output.length() > 0) {
            File f = new File(output);
            try {
                Files.deleteIfExists(f.toPath());
                outstream = new PrintStream(f, Charset.defaultCharset());
            } catch (IOException e) {
                e.printStackTrace(System.err);
                return;
            }
        } else {
            outstream = System.out;
        }

        Printer jprinter = JsonFormat.printer();

        ManagedChannel channel =
                ManagedChannelBuilder.forAddress(parent.hostname, parent.port)
                        .usePlaintext()
                        .build();
        URLFrontierBlockingStub blockingFrontier = URLFrontierGrpc.newBlockingStub(channel);

        Iterator<URLItem> it = blockingFrontier.listURLs(builder.build());
        while (it.hasNext()) {

            URLItem item = it.next();

            String fetchDate;
            if (parse) {
                Instant instant = Instant.ofEpochSecond(item.getKnown().getRefetchableFromDate());
                LocalDateTime localDate = instant.atZone(zoneId).toLocalDateTime();
                fetchDate = localDate.toString();
            } else {
                fetchDate = String.valueOf(item.getKnown().getRefetchableFromDate());
            }

            if (Boolean.TRUE.equals(json)) {
                try {
                    outstream.println(jprinter.print(item));
                } catch (InvalidProtocolBufferException e) {
                    e.printStackTrace(System.err);
                    break;
                }
            } else {
                outstream.println(item.getKnown().getInfo().getUrl() + ";" + fetchDate);
            }
        }

        outstream.close();
        channel.shutdownNow();
    }
}
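
As a usage sketch, the new subcommand could be invoked roughly as below to dump up to 1000 URLs of the DEFAULT crawl to a file, with refetch dates printed in the local time zone. The jar name and the parent Client command's host/port options are not part of this diff and are assumed here; the subcommand options (-n, -o, --parsedate) come from the code above.

java -jar urlfrontier-client.jar ListURLs -n 1000 --parsedate -o frontier_urls.txt

In the default (non-JSON) mode each line is printed as "<url>;<refetch date>", for example (hypothetical): https://example.com/;2024-03-01T10:15:30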

service/src/main/java/crawlercommons/urlfrontier/service/AbstractFrontierService.java

Lines changed: 92 additions & 0 deletions
@@ -13,6 +13,7 @@
 import crawlercommons.urlfrontier.Urlfrontier.Boolean;
 import crawlercommons.urlfrontier.Urlfrontier.Empty;
 import crawlercommons.urlfrontier.Urlfrontier.GetParams;
+import crawlercommons.urlfrontier.Urlfrontier.KnownURLItem;
 import crawlercommons.urlfrontier.Urlfrontier.Local;
 import crawlercommons.urlfrontier.Urlfrontier.LogLevelParams;
 import crawlercommons.urlfrontier.Urlfrontier.QueueDelayParams;
@@ -872,4 +873,95 @@ public void close() throws IOException {
     public abstract void getURLStatus(
             crawlercommons.urlfrontier.Urlfrontier.URLStatusRequest request,
             io.grpc.stub.StreamObserver<URLItem> responseObserver);
+
+    public void listURLs(
+            crawlercommons.urlfrontier.Urlfrontier.ListUrlParams request,
+            io.grpc.stub.StreamObserver<crawlercommons.urlfrontier.Urlfrontier.URLItem>
+                    responseObserver) {
+
+        long maxURLs = request.getSize();
+        long start = request.getStart();
+        String key = request.getKey();
+
+        final String normalisedCrawlID = CrawlID.normaliseCrawlID(request.getCrawlID());
+
+        // 100 by default
+        if (maxURLs == 0) {
+            maxURLs = 100;
+        }
+
+        LOG.info(
+                "Received request to list URLs [size {}; start {}; crawlId {}, key {}]",
+                maxURLs,
+                start,
+                normalisedCrawlID,
+                key);
+
+        long totalCount = -1;
+        long sentCount = 0;
+
+        synchronized (getQueues()) {
+            Iterator<Entry<QueueWithinCrawl, QueueInterface>> qiterator =
+                    getQueues().entrySet().iterator();
+
+            while (qiterator.hasNext() && sentCount < maxURLs) {
+                Entry<QueueWithinCrawl, QueueInterface> e = qiterator.next();
+
+                // check that it is within the right crawlID
+                if (!e.getKey().getCrawlid().equals(normalisedCrawlID)) {
+                    continue;
+                }
+
+                // check that it is within the right key/queue
+                if (key != null && !key.isEmpty() && !e.getKey().getQueue().equals(key)) {
+                    continue;
+                }
+
+                Iterator<URLItem> urliter = urlIterator(e);
+
+                while (urliter.hasNext()) {
+                    totalCount++;
+                    if (totalCount < start) {
+                        urliter.next();
+                    } else if (sentCount < maxURLs) {
+                        responseObserver.onNext(urliter.next());
+                        sentCount++;
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+
+        responseObserver.onCompleted();
+    }
+
+    protected Iterator<URLItem> urlIterator(Entry<QueueWithinCrawl, QueueInterface> qentry) {
+        return urlIterator(qentry, 0L, Long.MAX_VALUE);
+    }
+
+    protected abstract Iterator<URLItem> urlIterator(
+            Entry<QueueWithinCrawl, QueueInterface> qentry, long start, long max);
+
+    /**
+     * Builds an URLItem for listURLs (used by fetchURLItems, avoids builder instantiation for
+     * every URL)
+     *
+     * @param builder The URLItem builder
+     * @param kbuilder The KnownURLItem builder
+     * @param info URLInfo
+     * @param refetch refetch date from Epoch in seconds
+     * @return
+     */
+    public static URLItem buildURLItem(
+            URLItem.Builder builder, KnownURLItem.Builder kbuilder, URLInfo info, long refetch) {
+        builder.clear();
+        kbuilder.clear();
+
+        kbuilder.setInfo(info);
+        kbuilder.setRefetchableFromDate(refetch);
+        builder.setKnown(kbuilder.build());
+
+        return builder.build();
+    }
 }
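
Concrete backends only need to supply the abstract urlIterator(Entry, start, max); the static buildURLItem helper lets them reuse one pair of builders per queue instead of allocating new ones for every URL. Below is a minimal sketch of what an implementation over an in-memory list of URLInfo might look like; pendingURLs and nextFetchDate are hypothetical helpers, not part of this commit.

// Hypothetical sketch of a concrete backend's urlIterator: pendingURLs(qentry) returns a
// List<URLInfo> for the queue and nextFetchDate(info) its refetch time in epoch seconds;
// both stand in for whatever storage the backend actually uses.
@Override
protected Iterator<URLItem> urlIterator(
        Entry<QueueWithinCrawl, QueueInterface> qentry, long start, long max) {
    URLItem.Builder builder = URLItem.newBuilder();
    KnownURLItem.Builder kbuilder = KnownURLItem.newBuilder();
    return pendingURLs(qentry).stream()
            .skip(start)
            .limit(max)
            .map(info -> buildURLItem(builder, kbuilder, info, nextFetchDate(info)))
            .iterator();
}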

service/src/main/java/crawlercommons/urlfrontier/service/ignite/IgniteService.java

Lines changed: 15 additions & 0 deletions
@@ -9,6 +9,7 @@
 import crawlercommons.urlfrontier.Urlfrontier.AckMessage.Status;
 import crawlercommons.urlfrontier.Urlfrontier.GetParams;
 import crawlercommons.urlfrontier.Urlfrontier.KnownURLItem;
+import crawlercommons.urlfrontier.Urlfrontier.ListUrlParams;
 import crawlercommons.urlfrontier.Urlfrontier.URLInfo;
 import crawlercommons.urlfrontier.Urlfrontier.URLItem;
 import crawlercommons.urlfrontier.Urlfrontier.URLStatusRequest;
@@ -29,6 +30,7 @@
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
@@ -848,4 +850,17 @@ protected Status putURLItem(URLItem value) {
     public void getURLStatus(URLStatusRequest request, StreamObserver<URLItem> responseObserver) {
         responseObserver.onError(io.grpc.Status.UNIMPLEMENTED.asException());
     }
+
+    @Override
+    // TODO Implementation of listURLs for Ignite
+    public void listURLs(ListUrlParams request, StreamObserver<URLItem> responseObserver) {
+        responseObserver.onError(io.grpc.Status.UNIMPLEMENTED.asException());
+    }
+
+    @Override
+    // TODO Implementation of listURLs for Ignite
+    protected Iterator<URLItem> urlIterator(
+            java.util.Map.Entry<QueueWithinCrawl, QueueInterface> qentry, long start, long max) {
+        throw new UnsupportedOperationException("Feature not implemented for Ignite backend");
+    }
 }
