-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlexisnexis.py
246 lines (190 loc) · 7.72 KB
/
lexisnexis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/env python
# encoding: utf-8
"""
Created by Neal Caren on 2012-05-14.
neal.caren@unc.edu
Edited by Alex Hanna to make this into a module and handle several different news sources.
alex.hanna@gmail.com
Takes a downloaded plain text LexisNexis file and returns list of documents.
"""
from docx import Document
import os
import re
import sys
from datetime import datetime
def isMonth(x):
    """Return True if *x* parses as a month name, full ("July") or
    abbreviated ("Jul").

    Bug fix: the original set a flag to False after the "%B" attempt
    failed and never reset it, so the "%b" branch could never return
    True -- abbreviated month names were always rejected.
    """
    for fmt in ("%B", "%b"):
        try:
            datetime.strptime(x, fmt)
            return True
        except ValueError:
            ## not this format; try the next one
            pass
    return False
def isInt(x):
    """Return True if *x* can be converted to a base-10 integer."""
    try:
        int(x)
    except ValueError:
        return False
    return True
def parseDate(date_ed):
    """Parse a Lexis-Nexis date/edition line into ("YYYY-MM-DD", edition).

    Recognized shapes (commas are stripped first):
      NYT:               July 27 2008 Sunday Late Edition - Final
      USATODAY:          April 7, 1997, Monday, FINAL EDITION
      WaPo:              June 06, 1996, Thursday, Final Edition
      Corporate Counsel: June 2014
      numeric:           04/30/2014

    Returns (None, '') when the line is not a recognizable date.

    Bug fix: the original printed a warning, set date = None, and then
    unconditionally called date.strftime(...) -- crashing on any
    unparsable date even though both callers check for a None date.
    Also accepts abbreviated month names ("%b") as a fallback.
    """
    def _is_int(tok):
        ## True if tok converts to an int
        try:
            int(tok)
            return True
        except ValueError:
            return False

    def _is_month(tok):
        ## True if tok is a full or abbreviated month name
        for fmt in ("%B", "%b"):
            try:
                datetime.strptime(tok, fmt)
                return True
            except ValueError:
                pass
        return False

    def _first_parse(value, formats):
        ## first format that parses value, else None
        for fmt in formats:
            try:
                return datetime.strptime(value, fmt)
            except ValueError:
                pass
        return None

    date_ed = date_ed.replace(',', '')
    da = re.split(r'\s+', date_ed)
    da2 = date_ed.split('/')
    date = None
    if len(da) >= 3 and _is_int(da[1]) and _is_int(da[2]):
        ## NYT, USA, WaPo: "Month DD YYYY ..."
        date = _first_parse(" ".join(da[0:3]), ("%B %d %Y", "%b %d %Y"))
    elif len(da) >= 2 and _is_int(da[1]) and _is_month(da[0]):
        ## Corporate Counsel: "June 2014 Northeast"
        date = _first_parse(" ".join(da[0:2]), ("%B %Y", "%b %Y"))
    elif len(da2) > 2:
        ## numeric "MM/DD/YYYY"
        date = _first_parse(" ".join(da2[0:3]), ("%m %d %Y",))
    if date is None:
        print("WARNING: Not a date: %s" % " ".join(da))
        return (None, '')
    date = date.strftime("%Y-%m-%d")
    ## everything after "Month DD YYYY Weekday" is the edition string
    ed = " ".join(tok.strip() for tok in da[4:])
    ## if the "edition" is actually a time or a weekday, drop it
    if 'GMT' in ed or 'day' in ed:
        ed = ''
    return (date, ed)
def parseIndividualLexisNexis(filename):
    """Parse a single docx-format Lexis-Nexis download.

    Returns a metadata dict on success, None when the article date
    cannot be parsed, and False when the file cannot be opened.
    """
    ## the title is encoded in the file name; underscores stand in for colons
    title = filename.split('/')[-1].replace('.docx', '').replace('_', ':')
    try:
        doc = Document(filename)
    except IOError:
        return False
    ## keep only the non-empty paragraph texts
    paras = [p.text for p in doc.paragraphs if len(p.text) > 0]
    date, ed = parseDate(paras[1])
    ## no usable date -> return empty record
    if not date:
        return None
    pub = paras[0]
    record = {}
    record['PUBLICATION'] = pub
    record['DATE'] = date
    record['TITLE'] = title
    record['EDITION'] = ed
    ## body paragraphs sit between the "Body" and "Classification" markers
    start_index, end_index = 0, 0
    for idx, para in enumerate(paras):
        if para == 'Body':
            start_index = idx + 1
        if para == 'Classification':
            end_index = idx
        ## e.g. "Section:\xa0NEWS; Pg. 1"
        if re.match(r'Section:', para):
            record['SECTION'] = para.replace('Section:', '').replace('\xa0', '')
    ## JSON won't preserve escaped newlines, so join paragraphs with <br/>
    record['TEXT'] = "<br/>".join(paras[start_index:end_index])
    record['INTERNAL_ID'] = "%s_%s_%s" % (pub, date, title)
    record['DOCSOURCE'] = "Lexis-Nexis (%s)" % filename
    return record
def parseLexisNexis(filename):
    """Parse a plain-text Lexis-Nexis download into a list of article dicts.

    Each dict carries INTERNAL_ID, PUBLICATION, DATE, TITLE, EDITION,
    TEXT, plus any commonly occurring metadata labels (BYLINE:, etc.).
    Abstract-only entries are counted and skipped.

    Fixes over the original:
      * the input file is now closed (with-statement) instead of leaked;
      * the Python-2 blank-row filter now drops the matching search id
        together with the document -- filtering docs alone shifted
        ids[i] and attached the wrong id to every later article;
      * a file with zero documents returns [] instead of raising
        ZeroDivisionError in the metadata-frequency check.
    """
    abstracts = []
    with open(filename, 'r') as fh:
        text = fh.read()
    ## figure out what metadata labels (e.g. "BYLINE:") this file reports
    meta_list = list(set(re.findall('\\n([A-Z][A-Z-]*?):', text)))
    ## permanent columns every record gets
    header = ['INTERNAL_ID', 'PUBLICATION', 'DATE', 'TITLE', 'EDITION']
    ## TK: abstracts would break a Copyright-based end-of-document hack,
    ## so documents are split on the "N of M DOCUMENTS" header instead
    ## clean up the UTF-8 BOM bytes at the beginning of the file
    text = text.replace('\xef\xbb\xbf\r\n', '')
    ## NOTE(review): this removes the literal four characters "\xa0",
    ## not a non-breaking space -- preserved as-is; confirm against
    ## real input files before changing
    text = text.replace('\\xa0', '')
    ## boilerplate warning strings break date parsing; remove them
    warnings = [r'This content, including derivations, may not be stored or distributed in any\s+manner, disseminated, published, broadcast, rewritten\s+or reproduced without\s+express, written consent from STPNS\s+',
                r'No City News Service material may be republished without the express written\s+permission of the City News Service(,)* Inc\.\s+',
                r'Distributed by McClatchy\-Tribune Business News\s+',
                r'Distributed by Tribune Content Agency\s+',
                r'This content is provided to LexisNexis by Comtex News Network(,)* Inc\.\s+']
    for w in warnings:
        text = re.sub(w, '', text)
    ## split on the LN header; odd split pieces are search ids,
    ## even pieces are the document bodies
    docs = []
    ids = []
    for i, d in enumerate(re.split(r'\s+(\d+) of \d+ DOCUMENTS', text)):
        if i == 0:
            continue  # preamble before the first document header
        if i % 2 == 0:
            docs.append(d)
        else:
            ids.append(d)
    ## remove blank rows in Python 2, keeping ids aligned with docs
    if sys.version_info < (3, 0):
        kept = [(d, doc_id) for d, doc_id in zip(docs, ids)
                if len(re.split(r'\r\n\r\n', d)) > 2]
        docs = [d for d, _ in kept]
        ids = [doc_id for _, doc_id in kept]
    ## nothing to parse; also avoids dividing by zero below
    if not docs:
        print("\tAdded 0 articles, skipped %d abstracts" % len(abstracts))
        return []
    ## keep only the commonly occurring metadata labels (>20% of docs)
    meta_list = [m for m in meta_list if float(text.count(m)) / len(docs) > 0.20]
    articles = []
    ## begin loop over each article
    for i, f in enumerate(docs):
        ## split into lines and clean up the hard returns at line ends
        if sys.version_info < (3, 0):
            lines = [row.replace('\r\n', ' ').strip() for row in f.split('\r\n\r\n') if len(row) > 0]
        else:
            lines = [row.replace('\n', ' ').strip() for row in f.split('\n\n') if len(row) > 0]
        ## abstracts start with e.g.
        ## "Copyright 1990 The New York Times Company: Abstracts";
        ## skip the whole article if it's an abstract
        if 'Abstracts' in lines[0]:
            abstracts.append(lines[0])
            continue
        ## remove copyright boilerplate
        lines = [row for row in lines
                 if not re.match(r"^Copyright \d+.*$", row) and 'All Rights Reserved' not in row]
        ## make metadata dict with the permanent columns pre-filled
        meta_dict = {k: '' for k in header}
        pub = lines[0].strip()
        date_ed = lines[1].strip()
        title = lines[2].strip()
        date, ed = parseDate(date_ed)
        ## if there's no parsable date, go to the next record
        if not date:
            continue
        ## collect the body text and any metadata lines
        paragraphs = []
        for line in lines[3:]:
            ## a body line: non-empty, not indented, not ALL CAPS,
            ## not a "LABEL:" metadata line, and not a repeat of the title
            if len(line) > 0 and line[:2] != '  ' and line != line.upper() and len(re.findall(r'^[A-Z][A-Z-]*?:', line)) == 0 and title not in line:
                ## collapse runs of whitespace
                line = re.sub(r'\s+', ' ', line)
                line = line.replace('","', '" , "')
                paragraphs.append(line)
            else:
                metacheck = re.findall(r'^([A-Z][A-Z-]*?):', line)
                if len(metacheck) > 0 and metacheck[0] in meta_list:
                    meta_dict[metacheck[0]] = line.replace(metacheck[0] + ': ', '')
        ## put everything in the metadata dictionary
        meta_dict['PUBLICATION'] = pub
        meta_dict['DATE'] = date
        meta_dict['TITLE'] = title
        meta_dict['EDITION'] = ed
        ## since JSON won't preserve escaped newlines
        meta_dict['TEXT'] = "<br/>".join(paragraphs)
        meta_dict['INTERNAL_ID'] = "%s_%s_%s" % (pub, date, ids[i])
        articles.append(meta_dict)
    print("\tAdded %d articles, skipped %d abstracts" % (len(articles), len(abstracts)))
    return articles