-
Notifications
You must be signed in to change notification settings - Fork 2
/
create_unified_tsv.py
130 lines (118 loc) · 6.58 KB
/
create_unified_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#---------------------------------------------------------------------------------------
# Name: XML and SQLITE data into TSV
# Purpose: This module is used to combine the data from one of the XML article
# files (training/test/validation/crowd-sourced train/crowd-sourced test)
# with the ground truth from a SQLITE3 table together in a TSV file.
# The 'crowd sourced' XML files are xml files whose data has
# been manually annotated for the SemEval 2019 Task 4.
# IMPORTANT NOTE: this module changes hyperpartisan = true/false in the
# ground truth sqlite table to 1/0.
#
# Execution: python create_unified_tsv.py [-h]
# {training,validation,test,crowdsourced_train,crowdsourced_test}
#
# Author: Ashwath Sampath
#
# Created: 08-11-2018 (V1.0): Initial version which inserts only training
# data into the file
# Revisions: 22-11-2018 (V1.1): Added options to insert different XML data into
# different files.
# Code improved and made more user-friendly
# Moved select_from_ground_truth to ground_truth_sqlite
# Stop writing bias, url and id into the output file
# 28-11-2018 (V1.2): Adding id to output file again, sem_eval_dir_path
# option added, paths cleaned up.
#
# 12-12-2018 (V1.3): Not adding hyperpartisan to the file in write_to_tsv
# We don't have the ground truth in the test file.
#---------------------------------------------------------------------------------------
from lxml import etree
from bs4 import BeautifulSoup
import sqlite3
import argparse
import csv
import os
import ground_truth_sqlite
def get_xml_root(xml_filepath):
    """Parse the XML file at xml_filepath and return the root element of its tree."""
    return etree.parse(xml_filepath).getroot()
def get_xml_file_path(dataset_type, sem_eval_dir_path):
    """Map a dataset type to the article XML file that holds its data.

    The files are looked up under <sem_eval_dir_path>/data/Articles. For any
    dataset type without an XML file yet (both test sets at the moment), the
    placeholder string 'Dummy: not yet available' is returned instead of a path.
    """
    # NOTE: extend the table below once the test set ground truth is available.
    basepath = os.path.join(sem_eval_dir_path, 'data', 'Articles')
    path_templates = {
        # Buzzfeed (by-publisher) training articles
        'training': '{}/articles-training-bypublisher-20181122.xml',
        # Buzzfeed (by-publisher) validation articles
        'validation': '{}/articles-validation-bypublisher-20181122.xml',
        # Crowd-sourced (by-article) training articles
        'crowdsourced_train': '{}/articles-training-byarticle-20181122.xml',
    }
    template = path_templates.get(dataset_type)
    if template is None:
        # Buzzfeed test and crowd-sourced test files do not exist yet.
        return 'Dummy: not yet available'
    return template.format(basepath)
def set_output_file(dataset_type, sem_eval_dir_path):
    """Return the output (tsv) file path that belongs to dataset_type.

    All output files live under <sem_eval_dir_path>/data/IntegratedFiles.

    Args:
        dataset_type: one of 'training', 'validation', 'test',
            'crowdsourced_train' or 'crowdsourced_test'.
        sem_eval_dir_path: root directory of the SemEval data.

    Returns:
        The path of the tsv file for the dataset. Any value other than the
        first four types listed above falls through to the crowd-sourced
        test file name, as before.
    """
    # NOTE: This function needs to be edited when the test set ground truth is available
    basepath = os.path.join(sem_eval_dir_path, 'data', 'IntegratedFiles')
    if dataset_type == 'training':
        # Buzzfeed training ground truth
        return '{}/buzzfeed_training_withid.tsv'.format(basepath)
    if dataset_type == 'validation':
        # Buzzfeed validation ground truth
        return '{}/buzzfeed_validation_withid.tsv'.format(basepath)
    if dataset_type == 'crowdsourced_train':
        # Crowd-sourced training ground truth
        return '{}/crowdsourced_train_withid.tsv'.format(basepath)
    if dataset_type == 'test':
        # Buzzfeed test ground truth. This branch used to test for
        # 'crowdsourced_test', which swapped the two test file names.
        return '{}/buzzfeed_test_withid.tsv'.format(basepath)
    # Crowd-sourced testing ground truth (the '.tsv' extension was missing before)
    return '{}/crowdsourced_test_withid.tsv'.format(basepath)
def write_to_tsv(output_tsv, xml_file):
    """Read the articles from an XML file and write them to a tsv file.

    Every <article> element becomes one tab-separated row holding the
    article's id, title and text content. No header row is written.

    Args:
        output_tsv: path of the tsv file to create (overwritten if it exists).
        xml_file: path of the article XML file to read.

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if an article has no 'title' or 'id' attribute.
    """
    # DO NOT WRITE THE BIAS AND THE URL into the output file. They won't be used as I think
    # they would increase the bias of the Buzzfeed journalists who created the data set.
    fieldnames = ['id', 'title', 'content']
    # Parse the input before touching the output, so a failed read does not
    # leave a truncated tsv behind. The explicit encoding keeps the result
    # independent of the machine's locale.
    with open(xml_file, encoding='utf-8') as xml_handle:
        la_belle_soupe = BeautifulSoup(xml_handle.read(), "lxml-xml")
    # newline='' is what the csv module expects for files it writes to; the
    # with-block guarantees the rows are flushed and the file is closed.
    with open(output_tsv, 'w', encoding='utf-8', newline='') as training_tsv:
        training_writer = csv.DictWriter(training_tsv, delimiter='\t',
                                         fieldnames=fieldnames)
        for current_article in la_belle_soupe.find_all("article"):
            # Tabs and newlines inside a field would break the one-row-per-article
            # tsv layout, so both are replaced by spaces.
            title = current_article['title'].replace('\t', ' ').replace('\n', ' ')
            identifier = current_article['id']
            text = current_article.get_text().replace('\t', ' ').replace('\n', ' ')
            training_writer.writerow({'id': identifier, 'title': title, 'content': text})
def main():
    """Parse the command-line arguments and write the selected dataset to a tsv file.

    The positional 'type' argument selects the dataset; --path/-p overrides
    the SemEval directory under which the data, database and output live.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('type', choices=['training', 'validation', 'test',
                                         'crowdsourced_train', 'crowdsourced_test'],
                        help='Select the type of dataset to fetch from XML and SQLite')
    parser.add_argument("--path", '-p', default="/home/ashwath/Files/SemEval",
                        help="Use this argument to change the SemEval directory path (the default path is: '/home/ashwath/Files/SemEval')")
    args = parser.parse_args()
    sem_eval_dir_path = args.path
    db_path = os.path.join(sem_eval_dir_path, 'data', 'Databases', 'ground_truth.sqlite3')
    # NOTE(review): write_to_tsv no longer takes the connection or the table
    # name, so the three statements below are kept only to leave the script's
    # database-side behaviour untouched -- confirm whether they can be dropped.
    connection = ground_truth_sqlite.db_connect(db_path)
    # Results should be fetchable using column names as keys
    connection.row_factory = sqlite3.Row
    table_name = ground_truth_sqlite.set_sqlite_table_name(args.type)
    xml_file = get_xml_file_path(args.type, sem_eval_dir_path)
    output_tsv = set_output_file(args.type, sem_eval_dir_path)
    # write_to_tsv takes exactly (output_tsv, xml_file); the previous
    # four-argument call raised TypeError on every run.
    write_to_tsv(output_tsv, xml_file)


if __name__ == '__main__':
    main()