Skip to content

Commit

Permalink
Minor updates
Browse files Browse the repository at this point in the history
  • Loading branch information
vasgat committed Jun 3, 2019
1 parent b62474e commit 2e8262a
Show file tree
Hide file tree
Showing 15 changed files with 192 additions and 70 deletions.
39 changes: 27 additions & 12 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>certh.iti.mklab</groupId>
<artifactId>easIE</artifactId>
<version>0.2</version>
<version>0.4</version>
<packaging>jar</packaging>
<build>
<plugins>
Expand All @@ -19,7 +20,8 @@
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>certh.iti.mklab.easie.Main</mainClass>
</transformer>
</transformers>
Expand All @@ -34,6 +36,7 @@
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>

<dependencies>
<dependency>
<groupId>com.github.fge</groupId>
Expand All @@ -46,16 +49,21 @@
<version>20140107</version>
</dependency>
<dependency>
<groupId>certh.iti.mklab</groupId>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>com.github.vasgat</groupId>
<artifactId>jSimilarity</artifactId>
<version>0.1</version>
<version>0.2.1</version>
<type>jar</type>
</dependency>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.6.1</version>
</dependency>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-firefox-driver</artifactId>
Expand All @@ -81,10 +89,10 @@
<artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.3.0</version>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.3.0</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
Expand All @@ -95,6 +103,13 @@
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.52.0</version>
</dependency>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.0</version>
</dependency>

</dependencies>
</project>
20 changes: 10 additions & 10 deletions src/main/java/certh/iti/mklab/easie/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,22 @@
import com.github.fge.jsonschema.core.exceptions.ProcessingException;
import certh.iti.mklab.easie.executor.WrapperExecutor;
import certh.iti.mklab.easie.executor.handlers.DataHandler;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;

import org.json.JSONArray;
import org.jsoup.select.Selector.SelectorParseException;

/**
*
* @author vasgat
*/
public class Main {

public static void main(String[] args) throws URISyntaxException {

//args = new String[1];
//args[0] = "C:\\Users\\vasgat\\Desktop\\religiousgreece_example.json";
if (args.length == 1) {
try {
ConfigurationReader reader = new ConfigurationReader(args[0], ".");
Expand All @@ -51,11 +53,10 @@ public static void main(String[] args) throws URISyntaxException {

if (config.store != null) {
dh.store(config.store, config.source_name);
} else {
JSONArray array = new JSONArray(dh.exportJson());
System.out.println(array.toString(4));
}

JSONArray array = new JSONArray(dh.exportJson());

System.out.println(array.toString(4));
} catch (IOException ex) {
System.out.println(ex.getMessage());
} catch (ProcessingException ex) {
Expand Down Expand Up @@ -85,11 +86,10 @@ public static void main(String[] args) throws URISyntaxException {

if (config.store != null) {
dh.store(config.store, config.source_name);
} else {
JSONArray array = new JSONArray(dh.exportJson());
System.out.println(array.toString(4));
}

JSONArray array = new JSONArray(dh.exportJson());

System.out.println(array.toString(4));
} catch (IOException ex) {
System.out.println(ex.getMessage());
} catch (ProcessingException ex) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@ public class CompanySearcher {

/**
* Creates a CompanySearcher object
*
* @param mongo a MongoUtils object
* @param dbname database name
* @param collection's name
*/
public CompanySearcher(MongoCollection companies_collection) {
this.companies = companies_collection;
Expand Down Expand Up @@ -111,7 +107,7 @@ public ObjectId search(String company_name, String country, String website) {

public ObjectId search(String company_name, String country) {

double threshold = 0.75;
double threshold = 0.8;

CompanyDocument document = new CompanyDocument.Builder(company_name)
.id("candidate")
Expand Down Expand Up @@ -163,6 +159,44 @@ public boolean evaluate(CompanyDocument object) {
}
}

/**
 * Searches for a company using only its name.
 *
 * The name is wrapped in a temporary document with the id "candidate",
 * passed to the tf-idf model and compared against every document in the
 * corpus. Corpus documents whose similarity is at least 0.9 (and which are
 * not equal to the temporary document) are kept as candidates. When at
 * least one candidate exists, the id selected by
 * MapFunctionsUtils.getTopValues2(candidates, 1) (presumably the
 * best-scoring one, to be confirmed against that helper) is converted to an
 * ObjectId after the first underscore and everything following it has been
 * removed from the id string.
 *
 * When no candidate reaches the threshold, the companies collection is
 * queried for a document whose "aliases" field matches the given name.
 *
 * @param company_name the company name to look for
 * @return the ObjectId of the matched company, or null when neither the
 * similarity search nor the alias lookup finds a match
 */
public ObjectId search(String company_name) {

    double threshold = 0.9;

    CompanyDocument document = new CompanyDocument.Builder(company_name)
            .id("candidate")
            .build();

    tfidf.calculate(document);

    // Diamond instead of the raw type avoids an unchecked conversion.
    HashMap<String, Double> candidates = new HashMap<>();

    Iterator<CompanyDocument> it = corpus.iterator();

    while (it.hasNext()) {
        CompanyDocument company = it.next();

        double similarity = tfidf.similarity("candidate", company.id);

        if (similarity >= threshold && !company.equals(document)) {
            candidates.put(company.id, similarity);
        }
    }

    if (!candidates.isEmpty()) {
        String id = (String) MapFunctionsUtils.getTopValues2(candidates, 1).keySet().iterator().next();
        return new ObjectId(id.replaceAll("_.*", ""));
    }

    // Fall back to an exact alias lookup. first() yields the first match or
    // null, so no cursor is left open for this method to close.
    Document query = new Document("aliases", company_name);
    Document c = (Document) companies.find(query).first();
    return c != null ? c.getObjectId("_id") : null;
}

/**
* Lowercases a set of String objects
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ private HashMap<String, String> getCountry2ABR() throws FileNotFoundException, I
String line;
while ((line = in.readLine()) != null) {
String[] array = line.split(";");
ABR2Letter.put(array[1], array[0].toLowerCase());
ABR2Letter.put(array[1].toLowerCase(), array[0].toLowerCase());
}
in.close();
return ABR2Letter;
Expand All @@ -59,7 +59,7 @@ private HashMap<String, String> getCountry3ABR() throws FileNotFoundException, I
String line;
while ((line = in.readLine()) != null) {
String[] array = line.split(";");
ABR3Letter.put(array[2], array[0].toLowerCase());
ABR3Letter.put(array[2].toLowerCase(), array[0].toLowerCase());
}
in.close();
return ABR3Letter;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ public class Store {
public String metrics_collection;

public DBCreadentials db_credentials;

public String wikirate_metric_designer;
}

public class DBCreadentials {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
import certh.iti.mklab.easie.MongoUtils;
import certh.iti.mklab.easie.configuration.Configuration.Store;
import com.mongodb.client.MongoCollection;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.List;

import org.bson.Document;

/**
Expand All @@ -42,7 +44,7 @@ public class DataHandler {
* Creates a DataHandler object
*
* @param extracted_company_info: extracted company fields
* @param extracted_metrics: extracted snippet fields
* @param extracted_metrics: extracted snippet fields
* @throws Exception
*/
public DataHandler(List<Document> extracted_company_info, List<ArrayList<Document>> extracted_metrics) {
Expand All @@ -55,9 +57,9 @@ public DataHandler(List<Document> extracted_company_info, List<ArrayList<Documen
* Stores the extracted data (companies_fields and extracted_metrics) into
* mongodb or drive
*
* @param store contains information about where the data are going to be
* stored
* @param sourceName the data were collected
* @param store contains information about where the data are going to be
* stored
* @param source_name name of the source the data were collected from
* @throws UnknownHostException
* @throws FileNotFoundException
* @throws Exception
Expand Down Expand Up @@ -103,8 +105,10 @@ public void store(Store store, String source_name) throws UnknownHostException,
storeUtils.toMongoDB(metrics_collection);

client.close();
} else if (store.format.equals("json")) {
storeUtils.toJSONFile(store.hd_path);
} else {
storeUtils.toFile(store.hd_path);
storeUtils.toCSVFile(store.hd_path, store.wikirate_metric_designer);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@

import certh.iti.mklab.easie.configuration.Configuration;
import certh.iti.mklab.easie.extractors.AbstractHTMLExtractor;
import org.apache.commons.lang3.tuple.Pair;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import javafx.util.Pair;

/**
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ public void toMongoDB(MongoCollection metrics_collection) {
}
}

public void toFile(String filePath) throws FileNotFoundException, UnsupportedEncodingException {
public void toJSONFile(String filePath) throws FileNotFoundException, UnsupportedEncodingException {
PrintWriter writer = new PrintWriter(
new OutputStreamWriter(
new FileOutputStream(filePath), "UTF8"));
Expand All @@ -155,6 +155,15 @@ public void toFile(String filePath) throws FileNotFoundException, UnsupportedEnc
writer.close();
}

/**
 * Writes the CSV export of the extracted metrics to a file.
 *
 * The file is written with the "UTF8" charset and contains the text
 * returned by exportCSV(metric_designer) followed by a line separator.
 *
 * @param filePath path of the file to create or overwrite
 * @param metric_designer designer name forwarded to exportCSV
 * @throws FileNotFoundException if the file cannot be opened for writing
 * @throws UnsupportedEncodingException if the "UTF8" charset is not supported
 * @throws IOException declared for callers; not thrown directly by this body
 */
public void toCSVFile(String filePath, String metric_designer) throws FileNotFoundException, UnsupportedEncodingException, IOException {
    // try-with-resources closes the writer (and the underlying stream) even
    // when exportCSV throws, which the manual close() did not guarantee.
    try (PrintWriter writer = new PrintWriter(
            new OutputStreamWriter(
                    new FileOutputStream(filePath), "UTF8"))) {
        writer.println(exportCSV(metric_designer));
    }
}

/**
* @return the extracted data in JSON format
*/
Expand Down Expand Up @@ -209,6 +218,33 @@ public String exportJson() {

}

/**
 * Renders the extracted metrics as CSV text, one line per metric.
 *
 * Every line has the form
 * designer+metricName,"companyName",citeyear,value,"source"
 * where double quotes are removed from the company name. Metrics named
 * "crawl_to" are left out. number_of_metrics is incremented once for every
 * line that is produced.
 *
 * @param wikirate_metric_designer prefix placed before "+metricName" in the
 * first column of every line
 * @return the CSV text, or an empty string when no company info is available
 */
public String exportCSV(String wikirate_metric_designer) {
    // StringBuilder avoids re-copying the whole text on every appended row.
    StringBuilder csv = new StringBuilder();

    if (extracted_company_info == null) {
        return csv.toString();
    }

    for (int j = 0; j < extracted_company_info.size(); j++) {
        // The original tested the whole list for emptiness here, which can
        // never be true inside this loop; the per-company check is applied
        // instead so companies without any extracted field are skipped.
        if (extracted_company_info.get(j).isEmpty()) {
            continue;
        }

        JSONObject company = new JSONObject(extracted_company_info.get(j).toJson());

        ArrayList<Document> temp_metrics = extracted_metrics.get(j);
        for (int i = 0; i < temp_metrics.size(); i++) {
            if (temp_metrics.get(i).getString("name").equals("crawl_to")) {
                continue;
            }

            JSONObject current_metric = new JSONObject(temp_metrics.get(i).toJson());

            csv.append(wikirate_metric_designer)
                    .append("+").append(current_metric.getString("name"))
                    .append(",\"").append(company.getString("company_name").replace("\"", ""))
                    .append("\",").append(current_metric.get("citeyear"))
                    .append(",").append(current_metric.get("value"))
                    .append(",\"").append(current_metric.get("source"))
                    .append("\"\n");

            number_of_metrics++;
        }
    }
    return csv.toString();
}

public int getNumberOfExtractedMetrics() {
if (number_of_metrics == 0) {
exportJson();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.List;
import javafx.util.Pair;
import org.apache.commons.lang3.tuple.Pair;

/**
* AbstractHTMLExtractor Object
* @author vasgat
Expand Down
Loading

0 comments on commit 2e8262a

Please sign in to comment.