Merge pull request #650 from zutto/master

Fix autocrawler crashing
yacy · Jul 10, 2024 · 2f5f3f8 · 2f5f3f8
2 parents 326b5f6 + 5268ae2
commit 2f5f3f8
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 3 deletions.
diff --git a/htroot/Autocrawl_p.html b/htroot/Autocrawl_p.html
@@ -20,7 +20,7 @@ <h2>Autocrawler</h2>
     			#(changed)#::<dt></dt><dd><span class="error">You need to restart for some settings to be applied</span></dd>#(/changed)#
     			<dt>Enable Autocrawler:</dt>
     			<dd><input id="autocrawlEnable" name="autocrawlEnable" type="checkbox" #(autocrawlEnable)#::checked="checked"#(/autocrawlEnable)# /></dd>
-    			<dt>Deep crawl every:</dt>
+    			<dt>Deep crawl every Nth document:</dt>
     			<dd>
     				<input id="autocrawlRatio" name="autocrawlRatio" type="number" min="1" max="500" step="1" size="2" maxlength="2" value="#[autocrawlRatio]#" />
     				Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.
@@ -47,4 +47,4 @@ <h2>Autocrawler</h2>
     		</dl>
     	</form>
     </fieldset>
-</body>
+</body>
diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf
@@ -211,7 +211,7 @@
        <source>Enable Autocrawler:</source>
     </trans-unit>
     <trans-unit id="66a1bd2c" xml:space="preserve" approved="no" translate="yes">
-       <source>Deep crawl every:</source>
+       <source>Deep crawl every Nth document:</source>
     </trans-unit>
     <trans-unit id="2291c65d" xml:space="preserve" approved="no" translate="yes">
        <source>Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.</source>

diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -608,12 +608,19 @@ public boolean autocrawlJob() {
             int i = 0;
             int deepRatio = Integer.parseInt(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_RATIO, "50"));
             for (SolrDocument doc: resp.getResults()) {
+		if (doc == null) {
+		    continue;
+		}
                 boolean deep = false;
                 i++;
                 if( i % deepRatio == 0 ){
                     deep = true;
                 }
                 DigestURL url;
+		if (doc.getFieldValue("url_protocol_s") == null || doc.getFieldValue("host_s") == null) {
+			// Skip documents missing the protocol or host field: both are dereferenced below to build the URL, so a null here would throw.
+			continue; 
+		}
                 final String u = doc.getFieldValue("url_protocol_s").toString() + "://" + doc.getFieldValue("host_s").toString();
                 try {
                     url = new DigestURL(u);