Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ The assembly jar file includes also the [WebGraph](https://webgraph.di.unimi.it/
The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser.


### Source Code Formatting

Run `mvn spotless:check` to verify the source code formatting and `mvn spotless:apply` to fix formatting violations automatically. See the [Spotless Maven guide](https://github.com/diffplug/spotless/blob/main/plugin-maven/README.md) for details. The Java formatting rules are defined in [eclipse-formatter.xml](eclipse-formatter.xml).


## Memory and Disk Requirements

Note that the webgraphs are usually multiple Gigabytes in size and require for processing
Expand Down
404 changes: 404 additions & 0 deletions eclipse-formatter.xml

Large diffs are not rendered by default.

154 changes: 90 additions & 64 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<?xml version="1.0" encoding="UTF-8"?>
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.commoncrawl</groupId>
Expand All @@ -25,68 +28,6 @@
<junit.version>5.13.2</junit.version>
</properties>

<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.14.0</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.7.1</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<finalName>cc-webgraph-${project.version}</finalName>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.5.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.5.0</version>
<executions>
<execution>
<id>enforce-maven</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireMavenVersion>
<version>3.6.3</version>
</requireMavenVersion>
</rules>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>



<dependencyManagement>
<dependencies>
<dependency>
Expand Down Expand Up @@ -248,4 +189,89 @@
</dependency>

</dependencies>

<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.14.0</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.7.1</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<finalName>cc-webgraph-${project.version}</finalName>
</configuration>
<executions>
<execution>
<goals>
<goal>single</goal>
</goals>
<phase>package</phase>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.5.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.5.0</version>
<executions>
<execution>
<id>enforce-maven</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireMavenVersion>
<version>3.6.3</version>
</requireMavenVersion>
</rules>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>2.46.1</version>
<configuration>
<pom>
<!-- These are the defaults; override them if needed. -->
<includes>
<include>pom.xml</include>
</includes>
<sortPom>
<indentAttribute>all</indentAttribute>
<keepBlankLines>true</keepBlankLines>
<expandEmptyElements>false</expandEmptyElements>
<nrOfIndentSpace>-1</nrOfIndentSpace>
<predefinedSortOrder>recommended_2008_06</predefinedSortOrder>
</sortPom>
</pom>
<java>
<eclipse>
<file>${project.basedir}/eclipse-formatter.xml</file>
</eclipse>
</java>
</configuration>
</plugin>
</plugins>
</build>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ public class CreatePreferenceVector {
long recordsProcessed;
long preferenceNamesFound;


/**
 * Creates a new instance and stores the given value as the default
 * preference value.
 *
 * @param defVal value assigned to {@code defaultPreferenceValue}
 */
public CreatePreferenceVector(double defVal) {
	this.defaultPreferenceValue = defVal;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,10 @@ public String convertNode(String line) {
return null;
}
if (lastDomain != null && domain.equals(lastDomain.name)) {
// short cut for the common case of many subsequent subdomains of the same domain
/*
* short cut for the common case of many subsequent subdomains of the same
* domain
*/
lastDomain.add(id);
return null;
}
Expand Down
7 changes: 3 additions & 4 deletions src/main/java/org/commoncrawl/webgraph/JoinSortRanks.java
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ private void assignRank(int[] ranks, IntComparator comp) {
indirectSortPerm[i] = i;
}
Arrays.parallelQuickSort(0, length, comp, this::swapIndirect);
for (int i = 0; i < length; ) {
for (int i = 0; i < length;) {
ranks[indirectSortPerm[i]] = ++i;
}
indirectSortPerm = null;
Expand Down Expand Up @@ -139,7 +139,7 @@ public String addRanks(String line) {
long id = Long.parseLong(line.substring(0, sep));
// check whether new line is already contained
int end = line.lastIndexOf('\n');
String revHost = line.substring(sep+1);
String revHost = line.substring(sep + 1);
float hcv = getHarmonicCentralityValue(id);
long hcr = getHarmonicCentralityRank(id);
double prv = getPageRankValue(id);
Expand All @@ -160,7 +160,6 @@ public String addRanks(String line) {
return sb.toString();
}


/**
* Implementation of {@link JoinSortRanks} for lists exceeding
* {@link Arrays#MAX_ARRAY_SIZE}.
Expand Down Expand Up @@ -229,7 +228,7 @@ private void assignRank(long[][] ranks, LongComparator comp) {
BigArrays.set(indirectSortPerm, i, i);
}
BigArrays.quickSort(0, length, comp, this::swapIndirect);
for (long i = 0; i < length; ) {
for (long i = 0; i < length;) {
BigArrays.set(ranks, BigArrays.get(indirectSortPerm, i), ++i);
}
indirectSortPerm = null;
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/org/commoncrawl/webgraph/explore/Graph.java
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,6 @@ public static String getTopLevelDomain(String reversedDomainName) {
return reversedDomainName;
}


/** Intersection of two sorted lists */
public static long[] intersect(long[] a, long[] b) {
int m = a.length;
Expand Down Expand Up @@ -523,7 +522,9 @@ public void subgraphMetrics(long[] nodes) {
LOG.info("\toutlinks = {} (links from the subgraph to outer nodes)", clusterOutlinks);
LOG.info("\ttotal inlinks = {} (all inlinks)", totalInlinks);
LOG.info("\ttotal outlinks = {} (all outlinks)", totalOutlinks);
LOG.info("\tnodes linked = {} (outer nodes linked from subgraph)", sharedSuccessors(nodes, 1, nodes.length).length);
LOG.info("\tnodes linking = {} (outer nodes linking to subgraph)", sharedPredecessors(nodes, 1, nodes.length).length);
LOG.info("\tnodes linked = {} (outer nodes linked from subgraph)",
sharedSuccessors(nodes, 1, nodes.length).length);
LOG.info("\tnodes linking = {} (outer nodes linking to subgraph)",
sharedPredecessors(nodes, 1, nodes.length).length);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -189,15 +189,14 @@ public void sl(String vertexLabel) {
public long[] loadVerticesFromFile(String fileName) {
AtomicLong lines = new AtomicLong();
try (Stream<String> in = Files.lines(Paths.get(fileName), StandardCharsets.UTF_8)) {
long[] res = in.mapToLong(
label -> {
lines.incrementAndGet();
long id = g.vertexLabelToId(label);
if (id == -1) {
LOG.debug("Vertex `{}` not found in graph.", label);
}
return id;
}).filter(id -> id > -1).toArray();
long[] res = in.mapToLong(label -> {
lines.incrementAndGet();
long id = g.vertexLabelToId(label);
if (id == -1) {
LOG.debug("Vertex `{}` not found in graph.", label);
}
return id;
}).filter(id -> id > -1).toArray();
LOG.info("Loaded {} vertices of {} lines in {}.", res.length, lines, fileName);
return res;
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ void testSimple() {
CountingMergedIntIterator iter = new CountingMergedIntIterator(LazyIntIterators.EMPTY_ITERATOR);
assertFalse(iter.hasNext());

int[][][] testArrays = { //
{{0, 1}}, //
{{0}, {1}}, //
{{1}, {0}}, //
{{1}, {0}, {}}, //
{{1}, {0}, {}, {0}, {0}}, //
{{1}, {0}, {}, {0}, {0, 1}}, //
int[][][] testArrays = { //
{ { 0, 1 } }, //
{ { 0 }, { 1 } }, //
{ { 1 }, { 0 } }, //
{ { 1 }, { 0 }, {} }, //
{ { 1 }, { 0 }, {}, { 0 }, { 0 } }, //
{ { 1 }, { 0 }, {}, { 0 }, { 0, 1 } }, //
// tests for input arrays with repeating numbers
{{1, 1}, {0, 0}, {}, {0, 0}, {0, 0}}, //
{{1, 1}, {0, 0}, {}, {0}, {0, 1}} //
{ { 1, 1 }, { 0, 0 }, {}, { 0, 0 }, { 0, 0 } }, //
{ { 1, 1 }, { 0, 0 }, {}, { 0 }, { 0, 1 } } //
};

for (int[][] tArrays : testArrays) {
Expand All @@ -48,7 +48,7 @@ void testSimple() {
int totalCount = 0;
iter = new CountingMergedIntIterator(tIters);
assertTrue(iter.hasNext());

assertEquals(0, iter.nextInt());
assertTrue(iter.getCount() > 0);
totalCount += iter.getCount();
Expand Down