Skip to content

Commit 3341ab1

Browse files
authored
Do not intern Strings in putURLItem (#140)
Use StripedLock instead of a synchronized block on an interned String. This will help reduce memory usage for large crawls. Signed-off-by: Laurent Klock <Laurent.Klock@arhs-cube.com>
1 parent e753840 commit 3341ab1

File tree

1 file changed

+18
-7
lines changed

1 file changed

+18
-7
lines changed

service/src/main/java/crawlercommons/urlfrontier/service/rocksdb/RocksDBService.java

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
package crawlercommons.urlfrontier.service.rocksdb;
55

6+
import com.google.common.util.concurrent.Striped;
67
import com.google.protobuf.InvalidProtocolBufferException;
78
import crawlercommons.urlfrontier.CrawlID;
89
import crawlercommons.urlfrontier.Urlfrontier.AckMessage.Status;
@@ -32,6 +33,7 @@
3233
import java.util.NoSuchElementException;
3334
import java.util.Set;
3435
import java.util.concurrent.ConcurrentHashMap;
36+
import java.util.concurrent.locks.Lock;
3537
import org.apache.commons.lang3.StringUtils;
3638
import org.rocksdb.BlockBasedTableConfig;
3739
import org.rocksdb.BloomFilter;
@@ -66,9 +68,11 @@ public class RocksDBService extends AbstractFrontierService {
6668

6769
private Statistics statistics;
6870

71+
private static final Striped<Lock> STRIPED_LOCKS = Striped.lock(128); // 128 stripes
72+
6973
// no explicit config
7074
public RocksDBService(String host, int port) {
71-
this(new HashMap<String, String>(), host, port);
75+
this(new HashMap<>(), host, port);
7276
}
7377

7478
private final ConcurrentHashMap<QueueWithinCrawl, QueueWithinCrawl> queuesBeingDeleted =
@@ -390,14 +394,15 @@ protected Status putURLItem(final URLItem value) {
390394
return Status.SKIPPED;
391395
}
392396

393-
// make it intern so that all threads accessing this method
394-
// share the same instance of the String, this way we can synchronize
395-
// on it and make sure that 2 threads working on the same URL won't
397+
final String existenceKeyString = (qk.toString() + "_" + url);
398+
// Synchronize on existence key (avoid interning String to reduce mem usage)
399+
// Make sure that 2 threads working on the same URL won't
396400
// both be considered non-existent
397-
final String existenceKeyString = (qk.toString() + "_" + url).intern();
398-
final byte[] existenceKey = existenceKeyString.getBytes(StandardCharsets.UTF_8);
401+
final Lock existenceLock = lockFor(existenceKeyString);
402+
existenceLock.lock();
399403

400-
synchronized (existenceKeyString) {
404+
try {
405+
final byte[] existenceKey = existenceKeyString.getBytes(StandardCharsets.UTF_8);
401406

402407
// is this URL already known?
403408
try (WriteBatch writeBatch = new WriteBatch();
@@ -468,6 +473,8 @@ protected Status putURLItem(final URLItem value) {
468473
LOG.error("RocksDB exception", e);
469474
return Status.FAIL;
470475
}
476+
} finally {
477+
existenceLock.unlock();
471478
}
472479

473480
return Status.OK;
@@ -1004,4 +1011,8 @@ public void close() {
10041011
this.rocksIterator.close();
10051012
}
10061013
}
1014+
1015+
private static Lock lockFor(String compositeKey) {
1016+
return STRIPED_LOCKS.get(compositeKey);
1017+
}
10071018
}

0 commit comments

Comments
 (0)