diff --git a/README.md b/README.md index c4cfa61..0af638f 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,11 @@ The assembly jar file includes also the [WebGraph](https://webgraph.di.unimi.it/ The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser. +### Source Code Formatting + +Run `mvn spotless:check` and `mvn spotless:apply`, see the [Spotless Maven guide](https://github.com/diffplug/spotless/blob/main/plugin-maven/README.md). Java formatting rules are defined in [eclipse-formatter.xml](eclipse-formatter.xml). + + ## Memory and Disk Requirements Note that the webgraphs are usually multiple Gigabytes in size and require for processing diff --git a/eclipse-formatter.xml b/eclipse-formatter.xml new file mode 100644 index 0000000..80cca1c --- /dev/null +++ b/eclipse-formatter.xml @@ -0,0 +1,404 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pom.xml b/pom.xml index 2e7db53..260f7cd 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,8 @@ - + + 4.0.0 org.commoncrawl @@ -25,68 +28,6 @@ 5.13.2 - - - - src/main/resources - - - - - maven-compiler-plugin - 3.14.0 - - ${java.version} - ${java.version} - - - - maven-assembly-plugin - 3.7.1 - - - jar-with-dependencies - - cc-webgraph-${project.version} - - - - package - - single - - - - - - maven-surefire-plugin - 3.5.2 - - - org.apache.maven.plugins - maven-enforcer-plugin - 3.5.0 - - - enforce-maven - - enforce - - - - - 3.6.3 - - - - - - - - - - - @@ -248,4 +189,89 @@ + + + + + src/main/resources + + + + + maven-compiler-plugin + 3.14.0 + + ${java.version} + ${java.version} + + + + maven-assembly-plugin + 3.7.1 + + + jar-with-dependencies + + cc-webgraph-${project.version} + + + + + single + + package + + + + + maven-surefire-plugin + 3.5.2 + + + org.apache.maven.plugins + maven-enforcer-plugin + 3.5.0 + + + enforce-maven + + enforce + + + + + 3.6.3 + + + + + + + + com.diffplug.spotless + spotless-maven-plugin + 2.46.1 + + + + + pom.xml + + + all + true + false + -1 + recommended_2008_06 + + + + + ${project.basedir}/eclipse-formatter.xml + + + + + + diff --git a/src/main/java/org/commoncrawl/webgraph/CreatePreferenceVector.java b/src/main/java/org/commoncrawl/webgraph/CreatePreferenceVector.java index a4b973a..3d55ee5 100644 --- a/src/main/java/org/commoncrawl/webgraph/CreatePreferenceVector.java +++ b/src/main/java/org/commoncrawl/webgraph/CreatePreferenceVector.java @@ -35,7 +35,6 @@ public class CreatePreferenceVector { long recordsProcessed; long preferenceNamesFound; - public CreatePreferenceVector(double defVal) { defaultPreferenceValue = defVal; } diff --git a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java index a47b4be..1b60cf7 100644 --- a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java +++ b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java @@ -337,7 +337,10 @@ public String convertNode(String line) { return null; } if (lastDomain != null && domain.equals(lastDomain.name)) { - // short cut for the common case of many subsequent subdomains of the same domain + /* + * short cut for the common case of many subsequent subdomains of the same + * domain + */ lastDomain.add(id); return null; } diff --git a/src/main/java/org/commoncrawl/webgraph/JoinSortRanks.java b/src/main/java/org/commoncrawl/webgraph/JoinSortRanks.java index f829ccc..cc6bf22 100644 --- a/src/main/java/org/commoncrawl/webgraph/JoinSortRanks.java +++ b/src/main/java/org/commoncrawl/webgraph/JoinSortRanks.java @@ -97,7 +97,7 @@ private void assignRank(int[] ranks, IntComparator comp) { indirectSortPerm[i] = i; } Arrays.parallelQuickSort(0, length, comp, this::swapIndirect); - for (int i = 0; i < length; ) { + for (int i = 0; i < length;) { ranks[indirectSortPerm[i]] = ++i; } indirectSortPerm = null; @@ -139,7 +139,7 @@ public String addRanks(String line) { long id = Long.parseLong(line.substring(0, sep)); // check whether new line is already contained int end = line.lastIndexOf('\n'); - String revHost = line.substring(sep+1); + String revHost = line.substring(sep + 1); float hcv = getHarmonicCentralityValue(id); long hcr = getHarmonicCentralityRank(id); double prv = getPageRankValue(id); @@ -160,7 +160,6 @@ public String addRanks(String line) { return sb.toString(); } - /** * Implementation of {@link JoinSortRanks} for lists exceeding * {@link Arrays#MAX_ARRAY_SIZE}. @@ -229,7 +228,7 @@ private void assignRank(long[][] ranks, LongComparator comp) { BigArrays.set(indirectSortPerm, i, i); } BigArrays.quickSort(0, length, comp, this::swapIndirect); - for (long i = 0; i < length; ) { + for (long i = 0; i < length;) { BigArrays.set(ranks, BigArrays.get(indirectSortPerm, i), ++i); } indirectSortPerm = null; diff --git a/src/main/java/org/commoncrawl/webgraph/explore/Graph.java b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java index fcb6214..c2e4cc1 100644 --- a/src/main/java/org/commoncrawl/webgraph/explore/Graph.java +++ b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java @@ -384,7 +384,6 @@ public static String getTopLevelDomain(String reversedDomainName) { return reversedDomainName; } - /** Intersection of two sorted lists */ public static long[] intersect(long[] a, long[] b) { int m = a.length; @@ -523,7 +522,9 @@ public void subgraphMetrics(long[] nodes) { LOG.info("\toutlinks = {} (links from the subgraph to outer nodes)", clusterOutlinks); LOG.info("\ttotal inlinks = {} (all inlinks)", totalInlinks); LOG.info("\ttotal outlinks = {} (all outlinks)", totalOutlinks); - LOG.info("\tnodes linked = {} (outer nodes linked from subgraph)", sharedSuccessors(nodes, 1, nodes.length).length); - LOG.info("\tnodes linking = {} (outer nodes linking to subgraph)", sharedPredecessors(nodes, 1, nodes.length).length); + LOG.info("\tnodes linked = {} (outer nodes linked from subgraph)", + sharedSuccessors(nodes, 1, nodes.length).length); + LOG.info("\tnodes linking = {} (outer nodes linking to subgraph)", + sharedPredecessors(nodes, 1, nodes.length).length); } } diff --git a/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java index 1e091e6..a3f1654 100644 --- a/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java +++ b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java @@ -189,15 +189,14 @@ public void sl(String vertexLabel) { public long[] loadVerticesFromFile(String fileName) { AtomicLong lines = new AtomicLong(); try (Stream in = Files.lines(Paths.get(fileName), StandardCharsets.UTF_8)) { - long[] res = in.mapToLong( - label -> { - lines.incrementAndGet(); - long id = g.vertexLabelToId(label); - if (id == -1) { - LOG.debug("Vertex `{}` not found in graph.", label); - } - return id; - }).filter(id -> id > -1).toArray(); + long[] res = in.mapToLong(label -> { + lines.incrementAndGet(); + long id = g.vertexLabelToId(label); + if (id == -1) { + LOG.debug("Vertex `{}` not found in graph.", label); + } + return id; + }).filter(id -> id > -1).toArray(); LOG.info("Loaded {} vertices of {} lines in {}.", res.length, lines, fileName); return res; } catch (IOException e) { diff --git a/src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java b/src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java index 3cd1ebf..a973b4a 100644 --- a/src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java +++ b/src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java @@ -26,16 +26,16 @@ void testSimple() { CountingMergedIntIterator iter = new CountingMergedIntIterator(LazyIntIterators.EMPTY_ITERATOR); assertFalse(iter.hasNext()); - int[][][] testArrays = { // - {{0, 1}}, // - {{0}, {1}}, // - {{1}, {0}}, // - {{1}, {0}, {}}, // - {{1}, {0}, {}, {0}, {0}}, // - {{1}, {0}, {}, {0}, {0, 1}}, // + int[][][] testArrays = { // + { { 0, 1 } }, // + { { 0 }, { 1 } }, // + { { 1 }, { 0 } }, // + { { 1 }, { 0 }, {} }, // + { { 1 }, { 0 }, {}, { 0 }, { 0 } }, // + { { 1 }, { 0 }, {}, { 0 }, { 0, 1 } }, // // tests for input arrays with repeating numbers - {{1, 1}, {0, 0}, {}, {0, 0}, {0, 0}}, // - {{1, 1}, {0, 0}, {}, {0}, {0, 1}} // + { { 1, 1 }, { 0, 0 }, {}, { 0, 0 }, { 0, 0 } }, // + { { 1, 1 }, { 0, 0 }, {}, { 0 }, { 0, 1 } } // }; for (int[][] tArrays : testArrays) { @@ -48,7 +48,7 @@ void testSimple() { int totalCount = 0; iter = new CountingMergedIntIterator(tIters); assertTrue(iter.hasNext()); - + assertEquals(0, iter.nextInt()); assertTrue(iter.getCount() > 0); totalCount += iter.getCount();