From 2e8262ac70618205a43c57fd6171ac51a927edcb Mon Sep 17 00:00:00 2001 From: vasgat Date: Mon, 3 Jun 2019 11:30:01 +0300 Subject: [PATCH] Minor updates --- pom.xml | 39 +++++++++++----- src/main/java/certh/iti/mklab/easie/Main.java | 20 ++++----- .../companymatching/CompanySearcher.java | 44 ++++++++++++++++--- .../CountryAbreviationsLoader.java | 4 +- .../easie/configuration/Configuration.java | 2 + .../easie/executor/handlers/DataHandler.java | 14 +++--- .../executor/handlers/ExtractionHandler.java | 3 +- .../easie/executor/handlers/StoreUtils.java | 38 +++++++++++++++- .../extractors/AbstractHTMLExtractor.java | 3 +- .../dynamicpages/BrowserEmulator.java | 34 ++++++++++++-- .../dynamicpages/DynamicHTMLExtractor.java | 18 ++++---- .../staticpages/GroupHTMLExtractor.java | 11 ++--- .../staticpages/PaginationIterator.java | 18 ++++---- .../staticpages/StaticHTMLExtractor.java | 10 ++--- .../staticpages/StaticHTMLFetcher.java | 4 +- 15 files changed, 192 insertions(+), 70 deletions(-) diff --git a/pom.xml b/pom.xml index b159dad..f1877ad 100644 --- a/pom.xml +++ b/pom.xml @@ -1,9 +1,10 @@ - + 4.0.0 certh.iti.mklab easIE - 0.2 + 0.4 jar @@ -19,7 +20,8 @@ - + certh.iti.mklab.easie.Main @@ -34,6 +36,7 @@ 1.8 1.8 + com.github.fge @@ -46,16 +49,21 @@ 20140107 - certh.iti.mklab + org.apache.commons + commons-csv + 1.4 + + + com.github.vasgat jSimilarity - 0.1 + 0.2.1 jar - + com.google.code.gson gson 2.6.1 - + org.seleniumhq.selenium selenium-firefox-driver @@ -81,10 +89,10 @@ phantomjsdriver 1.2.0 - - org.mongodb - mongo-java-driver - 3.3.0 + + org.mongodb + mongo-java-driver + 3.3.0 org.jsoup @@ -95,6 +103,13 @@ org.seleniumhq.selenium selenium-java 2.52.0 - + + + + org.apache.commons + commons-collections4 + 4.0 + + \ No newline at end of file diff --git a/src/main/java/certh/iti/mklab/easie/Main.java b/src/main/java/certh/iti/mklab/easie/Main.java index 14be6d6..dc3066b 100644 --- a/src/main/java/certh/iti/mklab/easie/Main.java +++ b/src/main/java/certh/iti/mklab/easie/Main.java @@ -23,20 +23,22 @@ import com.github.fge.jsonschema.core.exceptions.ProcessingException; import certh.iti.mklab.easie.executor.WrapperExecutor; import certh.iti.mklab.easie.executor.handlers.DataHandler; + import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; + import org.json.JSONArray; import org.jsoup.select.Selector.SelectorParseException; /** - * * @author vasgat */ public class Main { public static void main(String[] args) throws URISyntaxException { - + //args = new String[1]; + //args[0] = "C:\\Users\\vasgat\\Desktop\\religiousgreece_example.json"; if (args.length == 1) { try { ConfigurationReader reader = new ConfigurationReader(args[0], "."); @@ -51,11 +53,10 @@ public static void main(String[] args) throws URISyntaxException { if (config.store != null) { dh.store(config.store, config.source_name); + } else { + JSONArray array = new JSONArray(dh.exportJson()); + System.out.println(array.toString(4)); } - - JSONArray array = new JSONArray(dh.exportJson()); - - System.out.println(array.toString(4)); } catch (IOException ex) { System.out.println(ex.getMessage()); } catch (ProcessingException ex) { @@ -85,11 +86,10 @@ public static void main(String[] args) throws URISyntaxException { if (config.store != null) { dh.store(config.store, config.source_name); + } else { + JSONArray array = new JSONArray(dh.exportJson()); + System.out.println(array.toString(4)); } - - JSONArray array = new JSONArray(dh.exportJson()); - - System.out.println(array.toString(4)); } catch (IOException ex) { System.out.println(ex.getMessage()); } catch (ProcessingException ex) { diff --git a/src/main/java/certh/iti/mklab/easie/companymatching/CompanySearcher.java b/src/main/java/certh/iti/mklab/easie/companymatching/CompanySearcher.java index 10fc907..61b95a5 100644 --- a/src/main/java/certh/iti/mklab/easie/companymatching/CompanySearcher.java +++ b/src/main/java/certh/iti/mklab/easie/companymatching/CompanySearcher.java @@ -41,10 +41,6 @@ public class CompanySearcher { /** * Creates a CompanySearcher object - * - * @param mongo a MongoUtils object - * @param dbname database name - * @param collection's name */ public CompanySearcher(MongoCollection companies_collection) { this.companies = companies_collection; @@ -111,7 +107,7 @@ public ObjectId search(String company_name, String country, String website) { public ObjectId search(String company_name, String country) { - double threshold = 0.75; + double threshold = 0.8; CompanyDocument document = new CompanyDocument.Builder(company_name) .id("candidate") @@ -163,6 +159,44 @@ public boolean evaluate(CompanyDocument object) { } } + public ObjectId search(String company_name) { + + double threshold = 0.9; + + CompanyDocument document = new CompanyDocument.Builder(company_name) + .id("candidate") + .build(); + + tfidf.calculate(document); + + HashMap candidates = new HashMap(); + + Iterator it = corpus.iterator(); + + while (it.hasNext()) { + CompanyDocument company = it.next(); + + double similarity = tfidf.similarity("candidate", company.id); + + if (similarity >= threshold && !company.equals(document)) { + candidates.put(company.id, similarity); + } + } + + if (candidates.size() > 0) { + String id = (String) MapFunctionsUtils.getTopValues2(candidates, 1).keySet().iterator().next(); + return new ObjectId(id.replaceAll("_.*", "")); + } + + Document query = new Document("aliases", company_name); + Document c = (Document) companies.find(query).iterator().tryNext(); + if (c != null) { + return c.getObjectId("_id"); + } else { + return null; + } + } + /** * Lowercases a set of String objects * diff --git a/src/main/java/certh/iti/mklab/easie/companymatching/CountryAbreviationsLoader.java b/src/main/java/certh/iti/mklab/easie/companymatching/CountryAbreviationsLoader.java index 9209772..d23ea64 100644 --- a/src/main/java/certh/iti/mklab/easie/companymatching/CountryAbreviationsLoader.java +++ b/src/main/java/certh/iti/mklab/easie/companymatching/CountryAbreviationsLoader.java @@ -47,7 +47,7 @@ private HashMap getCountry2ABR() throws FileNotFoundException, I String line; while ((line = in.readLine()) != null) { String[] array = line.split(";"); - ABR2Letter.put(array[1], array[0].toLowerCase()); + ABR2Letter.put(array[1].toLowerCase(), array[0].toLowerCase()); } in.close(); return ABR2Letter; @@ -59,7 +59,7 @@ private HashMap getCountry3ABR() throws FileNotFoundException, I String line; while ((line = in.readLine()) != null) { String[] array = line.split(";"); - ABR3Letter.put(array[2], array[0].toLowerCase()); + ABR3Letter.put(array[2].toLowerCase(), array[0].toLowerCase()); } in.close(); return ABR3Letter; diff --git a/src/main/java/certh/iti/mklab/easie/configuration/Configuration.java b/src/main/java/certh/iti/mklab/easie/configuration/Configuration.java index 1e57caa..3ab8176 100644 --- a/src/main/java/certh/iti/mklab/easie/configuration/Configuration.java +++ b/src/main/java/certh/iti/mklab/easie/configuration/Configuration.java @@ -126,6 +126,8 @@ public class Store { public String metrics_collection; public DBCreadentials db_credentials; + + public String wikirate_metric_designer; } public class DBCreadentials { diff --git a/src/main/java/certh/iti/mklab/easie/executor/handlers/DataHandler.java b/src/main/java/certh/iti/mklab/easie/executor/handlers/DataHandler.java index 9aee89a..af25a89 100644 --- a/src/main/java/certh/iti/mklab/easie/executor/handlers/DataHandler.java +++ b/src/main/java/certh/iti/mklab/easie/executor/handlers/DataHandler.java @@ -19,11 +19,13 @@ import certh.iti.mklab.easie.MongoUtils; import certh.iti.mklab.easie.configuration.Configuration.Store; import com.mongodb.client.MongoCollection; + import java.io.FileNotFoundException; import java.io.IOException; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.List; + import org.bson.Document; /** @@ -42,7 +44,7 @@ public class DataHandler { * Creates SnippetHandler Object * * @param extracted_company_info: extracted company fields - * @param extracted_metrics: extracted snippet fields + * @param extracted_metrics: extracted snippet fields * @throws Exception */ public DataHandler(List extracted_company_info, List> extracted_metrics) { @@ -55,9 +57,9 @@ public DataHandler(List extracted_company_info, List temp_metrics = extracted_metrics.get(j); + for (int i = 0; i < temp_metrics.size(); i++) { + if (!temp_metrics.get(i).getString("name").equals("crawl_to")) { + JSONObject current_metric = new JSONObject(temp_metrics.get(i).toJson()); + String metric_string = wikirate_metric_designer + "+" + current_metric.getString("name") + ",\"" + company.getString("company_name").replace("\"", "") + "\"," + current_metric.get("citeyear") + "," + current_metric.get("value") + ",\"" + current_metric.get("source") + "\"\n"; + + csv += metric_string; + number_of_metrics++; + } + } + } + } + return csv; + } + public int getNumberOfExtractedMetrics() { if (number_of_metrics == 0) { exportJson(); diff --git a/src/main/java/certh/iti/mklab/easie/extractors/AbstractHTMLExtractor.java b/src/main/java/certh/iti/mklab/easie/extractors/AbstractHTMLExtractor.java index 6120573..2e2b213 100644 --- a/src/main/java/certh/iti/mklab/easie/extractors/AbstractHTMLExtractor.java +++ b/src/main/java/certh/iti/mklab/easie/extractors/AbstractHTMLExtractor.java @@ -19,7 +19,8 @@ import java.io.IOException; import java.net.URISyntaxException; import java.util.List; -import javafx.util.Pair; +import org.apache.commons.lang3.tuple.Pair; + /** * AbstractHTMLExtractor Object * @author vasgat diff --git a/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/BrowserEmulator.java b/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/BrowserEmulator.java index 99c66aa..b0d537d 100644 --- a/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/BrowserEmulator.java +++ b/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/BrowserEmulator.java @@ -34,8 +34,8 @@ public class BrowserEmulator extends Fetcher { public WebDriver driver; public BrowserEmulator(String baseURL, String relativeURL, String ChromeDriverPath) throws InterruptedException { - driver = Selenium.setUpChromeDriver(ChromeDriverPath); - driver.manage().window().setPosition(new Point(-2000, 0)); + +// driver.manage().window().setPosition(new Point(-2000, 0)); driver.manage().timeouts().implicitlyWait(60, TimeUnit.SECONDS); driver.get( baseURL + relativeURL @@ -43,9 +43,9 @@ public BrowserEmulator(String baseURL, String relativeURL, String ChromeDriverPa Thread.sleep(10000); } - public BrowserEmulator(String fullURL, String ChromeDriverPath) throws InterruptedException { + public BrowserEmulator(String fullURL, String ChromeDriverPath) throws InterruptedException { driver = Selenium.setUpChromeDriver(ChromeDriverPath); - driver.manage().window().setPosition(new Point(-2000, 0)); +// driver.manage().window().setPosition(new Point(-2000, 0)); driver.manage().timeouts().implicitlyWait(60, TimeUnit.SECONDS); driver.get( fullURL @@ -111,6 +111,32 @@ public void scrollDownEvent(int timesToRepeat) throws InterruptedException { } } + public void scrollUpEvent(int timesToRepeat) throws InterruptedException { + Random rand = new Random(); + String currentDoc = ""; + for (int i = 0; i < timesToRepeat; i++) { + currentDoc = driver.getPageSource(); + int counter = 0; + while (((Document) this.getHTMLDocument()).select("._hnn7m").size() > 0 && counter <= 600000) { + Thread.sleep(2000); + counter = 2000; + } + if (counter > 600000) { + break; + } + JavascriptExecutor jse = (JavascriptExecutor) driver; + jse.executeScript("window.scrollBy(0," + (-300 - rand.nextInt(312)) + ")", ""); + int sleep_time = rand.nextInt(3000); + Thread.sleep(2142 + sleep_time); + + /*if (currentDoc.equals(driver.getPageSource())){ + jse.executeScript("scroll(0, -25);"); + Thread.sleep(1968+rand.nextInt(12365)); + jse.executeScript("scroll(0, 250);"); + }*/ + } + } + /** * Closes browser driver */ diff --git a/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/DynamicHTMLExtractor.java b/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/DynamicHTMLExtractor.java index ca6371c..4a60351 100644 --- a/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/DynamicHTMLExtractor.java +++ b/src/main/java/certh/iti/mklab/easie/extractors/dynamicpages/DynamicHTMLExtractor.java @@ -21,11 +21,13 @@ import certh.iti.mklab.easie.exception.HTMLElementNotFoundException; import certh.iti.mklab.easie.extractors.FieldExtractor; import certh.iti.mklab.easie.extractors.TableFieldExtractor; + import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; -import javafx.util.Pair; + +import org.apache.commons.lang3.tuple.Pair; import org.jsoup.nodes.Element; /** @@ -44,7 +46,7 @@ public class DynamicHTMLExtractor extends AbstractHTMLExtractor { /** * Creates a new DynamicHTMLWrapper for a webpage * - * @param base_url: webpage base url + * @param base_url: webpage base url * @param relative_url: path to the specific spot in the page * @throws URISyntaxException * @throws IOException @@ -88,7 +90,7 @@ public List extractFields(List fields) { * extracts data from the specified table fields * * @param tableSelector: CSS table selector - * @param fields: list of table fields + * @param fields: list of table fields * @return an ArrayList of HashMap (corresponds to the extracted table * fields) */ @@ -96,9 +98,9 @@ public List extractFields(List fields) { public List extractTable(String tableSelector, List fields) { TableFieldExtractor extractor = new TableFieldExtractor( - (Element) browser_emulator.getHTMLDocument(), - tableSelector, source - ); + (Element) browser_emulator.getHTMLDocument(), + tableSelector, source + ); try { return extractor.getExtractedFields(fields, FIELD_TYPE.METRIC); } catch (HTMLElementNotFoundException ex) { @@ -110,7 +112,7 @@ public List extractTable(String tableSelector, List fields) { @Override public Pair extractFields(List cfields, List sfields) throws URISyntaxException, IOException { FieldExtractor extractor = new FieldExtractor((Element) browser_emulator.getHTMLDocument(), source); - return new Pair( + return Pair.of( extractor.getExtractedFields(cfields, FIELD_TYPE.COMPANY_INFO), extractor.getExtractedFields(sfields, FIELD_TYPE.METRIC) ); @@ -129,7 +131,7 @@ public Pair extractTable(String tableSelector, List cfields, Lis System.out.println(ex.getMessage()); } - return new Pair( + return Pair.of( extracted_company_info, extracted_metric_info ); diff --git a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/GroupHTMLExtractor.java b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/GroupHTMLExtractor.java index 68be765..76f734f 100644 --- a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/GroupHTMLExtractor.java +++ b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/GroupHTMLExtractor.java @@ -15,9 +15,9 @@ */ package certh.iti.mklab.easie.extractors.staticpages; -import certh.iti.mklab.easie.extractors.staticpages.SingleStaticPageExtractor; import certh.iti.mklab.easie.configuration.Configuration.ScrapableField; import certh.iti.mklab.easie.extractors.AbstractHTMLExtractor; + import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; @@ -32,7 +32,8 @@ import java.util.concurrent.Future; import java.util.logging.Level; import java.util.logging.Logger; -import javafx.util.Pair; + +import org.apache.commons.lang3.tuple.Pair; import org.bson.Document; /** @@ -79,7 +80,7 @@ public List extractFields(List fields) throws URISynta Iterator links = group_of_urls.iterator(); ExecutorService executorService = Executors.newFixedThreadPool(numThreads); - + List> handles = new ArrayList>(); while (links.hasNext()) { handles.add(executorService.submit(new SingleStaticPageExtractor( @@ -179,7 +180,7 @@ public Pair extractFields(List cfields, List sfi } } executorService.shutdownNow(); - return new Pair(extractedCFields, extractedSFields); + return Pair.of(extractedCFields, extractedSFields); } @Override @@ -212,6 +213,6 @@ public Pair extractTable(String tableSelector, List cfields, Lis } } executorService.shutdownNow(); - return new Pair(extractedCFields, extractedSFields); + return Pair.of(extractedCFields, extractedSFields); } } diff --git a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/PaginationIterator.java b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/PaginationIterator.java index 4c55df6..809109a 100644 --- a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/PaginationIterator.java +++ b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/PaginationIterator.java @@ -15,10 +15,10 @@ */ package certh.iti.mklab.easie.extractors.staticpages; -import certh.iti.mklab.easie.extractors.staticpages.SingleStaticPageExtractor; import certh.iti.mklab.easie.URLPatterns; import certh.iti.mklab.easie.configuration.Configuration.ScrapableField; import certh.iti.mklab.easie.extractors.AbstractHTMLExtractor; + import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -33,7 +33,8 @@ import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; -import javafx.util.Pair; + +import org.apache.commons.lang3.tuple.Pair; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -58,7 +59,6 @@ public class PaginationIterator extends AbstractHTMLExtractor { /** * Creates a new PaginationIterator * - * @param wrapper StaticHTMLExtractor object of an Instance page * @param next_page_selector next Page CSS selector in the page * @throws URISyntaxException * @throws IOException @@ -104,7 +104,7 @@ public List extractFields(List fields) throws URISyntax * Extracts data from a table from each page until no next page exists * * @param tableSelector CSS table selector - * @param fields List of table fields we want to extract + * @param fields List of table fields we want to extract * @return the extracted table fields as a List of HashMaps * @throws URISyntaxException * @throws IOException @@ -113,7 +113,7 @@ public List extractFields(List fields) throws URISyntax @Override public List extractTable(String tableSelector, List fields) throws URISyntaxException, IOException, InterruptedException { ArrayList extractedFields = new ArrayList(); - + if (thereisPattern) { extractedFields.addAll((ArrayList) MultiThreadPagination( frontPattern, @@ -152,7 +152,7 @@ public Pair extractFields(List cfields, List sfi extractedCFields.addAll((ArrayList) (((Pair) temp.get(i)).getKey())); extractedSFields.addAll((ArrayList) (((Pair) temp.get(i)).getValue())); } - return new Pair(extractedCFields, extractedSFields); + return Pair.of(extractedCFields, extractedSFields); } @Override @@ -160,7 +160,7 @@ public Pair extractTable(String tableSelector, List cfields, Lis ArrayList extractedCFields = new ArrayList(); ArrayList extractedSFields = new ArrayList(); ArrayList temp; - + if (thereisPattern) { temp = (ArrayList) MultiThreadPagination( frontPattern, @@ -178,7 +178,7 @@ public Pair extractTable(String tableSelector, List cfields, Lis extractedCFields.addAll((ArrayList) (((Pair) temp.get(i)).getKey())); extractedSFields.addAll((ArrayList) (((Pair) temp.get(i)).getValue())); } - return new Pair(extractedCFields, extractedSFields); + return Pair.of(extractedCFields, extractedSFields); } private boolean thereisPattern() throws IOException, URISyntaxException { @@ -218,7 +218,7 @@ public boolean thereisPattern(String url1, String url2) throws IOException, URIS startPage = temp2 - 2 * step; thereisPattern = true; } - + return URLPatterns.isInteger(url2.replace(frontPattern, "").replace(rearPattern, "")); } diff --git a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLExtractor.java b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLExtractor.java index 5fe3218..3b207ed 100644 --- a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLExtractor.java +++ b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLExtractor.java @@ -25,7 +25,7 @@ import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; -import javafx.util.Pair; +import org.apache.commons.lang3.tuple.Pair; import org.bson.Document; import org.jsoup.nodes.Element; @@ -72,7 +72,7 @@ public StaticHTMLExtractor(String FullLink) throws URISyntaxException, IOExcepti /** * extracts data from a list of specified fields from a webpage * - * @param fields: list of fields + * @param sfields: list of fields * @return a HashMap of the extracted fields */ @Override @@ -84,7 +84,7 @@ public List extractFields(List sfields) { /** * extracts data from the specified table fields * - * @param tableSelector: CSS table selector + * @param table_selector: CSS table selector * @param fields: list of table fields * @return an ArrayList of HashMap (corresponds to the extracted table * fields) @@ -103,7 +103,7 @@ public List extractTable(String table_selector, List f @Override public Pair extractFields(List cfields, List sfields) { FieldExtractor extractor = new FieldExtractor((Element) fetcher.getHTMLDocument(), source); - return new Pair( + return Pair.of( extractor.getExtractedFields(cfields, FIELD_TYPE.COMPANY_INFO), extractor.getExtractedFields(sfields, FIELD_TYPE.METRIC) ); @@ -121,7 +121,7 @@ public Pair extractTable(String table_selector, List cfields, Li System.out.println(ex.getMessage()); } - return new Pair( + return Pair.of( extracted_company_info, extracted_metric_info ); diff --git a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLFetcher.java b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLFetcher.java index 63ae6c8..37cbe46 100644 --- a/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLFetcher.java +++ b/src/main/java/certh/iti/mklab/easie/extractors/staticpages/StaticHTMLFetcher.java @@ -34,14 +34,14 @@ public class StaticHTMLFetcher extends Fetcher { private Connection connection; public StaticHTMLFetcher(String baseURL, String relativeURL) throws URISyntaxException, IOException { - connection = Jsoup.connect(new URI(baseURL + relativeURL).toASCIIString()) + connection = Jsoup.connect(new URI(baseURL + relativeURL).toASCIIString()).followRedirects(true).ignoreHttpErrors(true) .userAgent("Mozilla/37.0").timeout(60000); document = connection.get(); responseCode = connection.response().statusCode(); } public StaticHTMLFetcher(String fullURL) throws URISyntaxException, IOException { - connection = Jsoup.connect(new URI(fullURL).toASCIIString()) + connection = Jsoup.connect(new URI(fullURL).toASCIIString()).followRedirects(true).ignoreHttpErrors(true) .userAgent("Mozilla/37.0").timeout(60000); document = connection.get(); responseCode = connection.response().statusCode();