# main.py
import pandas as pd
import numpy as np
import json
import os
import csv
import pickle
import spacy
import time
#from encyclopedia_builder import ents_encyc_builder
from named_entity_recognition_articles import spacy_importer_prepper
from article_loader_parser import article_loader_to_df, random_corpus_sampling, article_text_id_assigner
def doc_opener(document_mentions, content, i, c):
    """Flatten per-document entity mentions into rows and dump them to a CSV backup."""
    docs = document_mentions.tolist()
    rows = []
    for doc in docs:
        for entity in doc:
            rows.append({"unique_id": entity["unique_id"], "entity": entity["entity"]})
            # optional intermediate backup every ~10k rows:
            # if len(rows) % 9999 == 0:
            #     pd.DataFrame(rows).to_csv(f"entities_test_{len(rows)}.csv", sep=",", quoting=csv.QUOTE_NONNUMERIC)
    df = pd.DataFrame(rows)
    df.to_csv(f"entities_backups/{content}_random_sample_{i}_to_{i + c}_entities.csv", sep=",", quoting=csv.QUOTE_NONNUMERIC)
global_path = 'C:/Users/17742/Desktop/win_art_writing/art_writing/text_cleaned'  # Windows
# Ubuntu: global_path = '/home/erik/Desktop/Datasets/art/art_writing/text_cleaned'
path = 'text_cleaned_all'
folder_path = os.path.join(global_path, path)
filelist = os.listdir(folder_path)
total = len(filelist)
rints = random_corpus_sampling(total, filelist)
# load the fixed sampling order so batches stay reproducible across runs
with open("random_integers_master.json", "r") as fp:
    loadit = json.load(fp)
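# random_integers_master.json is assumed to be the persisted output of
# random_corpus_sampling (rints above); a sketch for regenerating it, left
# commented out so a normal run never overwrites the master file:
# with open("random_integers_master.json", "w") as fp:
#     json.dump(rints, fp)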
c = 250  # batch size: number of articles per processing chunk
nlp = spacy.load('en_core_web_md')  # or 'en_core_web_trf' for the transformer pipeline
start = time.time()
article_text_id_assigner(folder_path=folder_path, iterable="all", begin=252548)
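# Quick sanity check that the pipeline loaded (hypothetical sentence; ORG and
# PERSON are the spaCy labels the organization/person extraction in
# spacy_importer_prepper is assumed to filter on):
sample_doc = nlp("Hilma af Klint's retrospective at the Guggenheim drew record crowds.")
print([(ent.text, ent.label_) for ent in sample_doc.ents])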
"""
for i in range(0,total, c):
begin = 252548
data = article_loader_to_df(folder_path=folder_path, iterable=c, begin=begin, fileguide=loadit)#, israndom=True
end = time.time()
print(f" now articles {i}to {(i+c)} of {total} with {int((end - start)//60)}:{(end - start)%60} elapsed")
#data1 = data[:a]
try:
document_mentions, document_person_mentions = spacy_importer_prepper(data=data, model='en_core_web_md')
doc_opener(document_mentions, "organizations", i, c)
doc_opener(document_person_mentions, "persons", i, c)
except ValueError:
pass
"""
# data = article_loader_to_df(folder_path=folder_path, iterable=5000, israndom=True)
# print(data)
# plot = timeplot_sentiment(data, "polarity", path)
# Progress log per source (these running totals are where begin=252548 comes from):
#   nytimes: 122660 logged (122791; 130 before error)
#   hyperallergic: 27979 more, finished; therefore new begin = 150770 (then plus 500)
#   artforum: logged 62414; frieze: logged 14818; brings total to 228002
#   artnet: 24546, bringing grand total to 252548 (250,241)
# path = 'nytimes'
# path = 'artnet_articles'
# path = 'frieze'
# path = 'artforum'
# path = 'hyperallergic'
"""import nltk
from nltk import RegexpTokenizer
import re
import string
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from numpy import linspace, loadtxt, ones, convolve
#from textblob import TextBlob
import matplotlib.pyplot as plt
import matplotlib.lines as ln
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter, drange, date2num
#from collections import Counter
from datetime import datetime
import time
#from article_loader_parser import article_text_id_assigner, article_loader_to_df
#from sentiment_analysis_functions import *
#from word_cleaning_functions import *
#from topic_modeling_functions import *
"""