# coding=utf-8
# python extract_post_feats_norm-per-thread.py ../dataconversion/Viva_forum/samples/106long20threads 106long20threads.postfeats.norm-per-thread.out
# python extract_post_feats_norm-per-thread.py ../dataconversion/GIST_FB/threads/ gistfb.postfeats.norm.out
# python extract_post_feats_norm-per-thread.py ../dataconversion/NYtimes/ nytimes.postfeats.norm.out
# Feature extraction:
# + (a) position in the thread (absolute, relative)
# + (b) popularity (# of responses)
# + (c) representativeness for the thread (cosine similarity for tf-idf weighted vector representations of post and thread/title)
# + (d) readability (word count, unique word count, type-token ratio, relative punctuation count, average word length, average sentence length)
# + (e) prominence of author (relative # of posts by the author in the thread)
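#
# Output: one tab-separated row per response post (opening posts are skipped),
# containing the features listed in 'featnames' below (z-score standardized),
# plus the raw number of annotator votes for the post in the last column.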
import os
import sys
import re
import math
import string
import operator
import functools
import numpy
from scipy.linalg import norm
import time
import xml.etree.ElementTree as ET
# from xml.dom import minidom
rootdir = sys.argv[1]
featfilename = sys.argv[2]

annotationsfile = ""
if rootdir == "../dataconversion/Viva_forum/samples/106long20threads":
    annotationsfile = "../annotation/annotations/selected_posts.txt"
elif rootdir == "../dataconversion/GIST_FB/threads/":
    annotationsfile = "../annotation/annotations/selected_posts_sents.txt"
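

# Parse the annotations file and return a dict keyed by threadid whose value is
# a dict mapping postid -> the number of annotators who selected that post.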
def get_postvotes_per_thread(annotationsfile):
    ########### FROM ANALYZE_ANNOTATIONS.PY ###########
    threads_per_user = dict()
    age_per_user = dict()
    freq_per_user = dict()
    sona_per_user = dict()
    email_per_user = dict()
    selected_per_thread_and_user = dict()
    utility_scores_per_thread = dict()
    nrselecteds_per_thread = dict()
    nrselecteds_per_user = dict()
    with open(annotationsfile, 'r') as annotations:
        for line in annotations:
            columns = line.split("\t")
            # Mon Jan 25 12:09:12 2016  Nikki de Groot  19  female  not  nikkidegroot24@gmail.com  97963
            name = columns[1]
            email = columns[5]
            if len(columns) > 7:
                # Mon Jan 25 12:14:31 2016  Nikki de Groot  19  female  not  nikkidegroot24@gmail.com  97963  103511  2 3 6 7 8 9 10 12 13 22 24 28  2  4
                threadid = columns[7]
                selected = columns[8]
                # skip test users and the example/excluded threads
                if (re.match(".*[a-zA-Z0-9].*", name) and not re.match(r'(?i).*(test|suzan).*', name)) \
                        and (not re.match(r'voorbeeld', threadid) and (not threadid == "239355") and (not threadid == "250167")):
                    #print(name)
                    threadsforuser = dict()
                    if name in threads_per_user:
                        threadsforuser = threads_per_user[name]
                    if threadid not in threadsforuser:
                        threadsforuser[threadid] = 1
                        #ann.write(line)
                        nrselecteds = list()
                        threads_per_user[name] = threadsforuser
                        selected_def = dict()
                        postids = selected.split(" ")
                        # a "-<postid>" entry means the annotator deselected that post;
                        # only a later re-occurrence of the same id counts as selected
                        removeatpos = dict()
                        pos = 0
                        for postid in postids:
                            if "-" in postid:
                                removeid = re.sub("-", "", postid)
                                removeatpos[removeid] = pos
                            pos += 1
                        pos = 0
                        for postid in postids:
                            if re.match("[0-9]+", postid):
                                if postid in removeatpos:
                                    if pos > removeatpos[postid]:
                                        selected_def[postid] = 1
                                else:
                                    selected_def[postid] = 1
                            pos += 1  # track the position so re-selections after a deselection count
                        selected_per_thread_and_user[(threadid, name)] = selected_def
                        nrselected = len(selected_def)
                        nrselecteds.append(nrselected)
                        nrselecteds_per_thread[threadid] = nrselecteds
                        nrselecteds_user = list()
                        if name in nrselecteds_per_user:
                            nrselecteds_user = nrselecteds_per_user[name]
                        nrselecteds_user.append(nrselected)
                        nrselecteds_per_user[name] = nrselecteds_user
                        #print(selected, selected_def)
                    #else:
                    #    sys.stderr.write("Warning: user "+email+" got thread "+threadid+" twice!\n")
    print(time.process_time(), "\t", "collect postvotes per thread")  # time.clock() no longer exists in Python >= 3.8
    postvotes_per_thread = dict()
    #out.write("\nthreadid\tuser\t# of selected posts\n")
    for (threadid, user) in selected_per_thread_and_user.keys():
        selected = selected_per_thread_and_user[(threadid, user)]
        #print(threadid, user, selected_per_thread_and_user[(threadid,user)], len(selected.keys()))
        #out.write(threadid+"\t"+user+"\t"+str(len(selected.keys()))+"\n")
        postvotes = dict()
        if threadid in postvotes_per_thread:
            postvotes = postvotes_per_thread[threadid]
        for postid in selected:
            if postid in postvotes:
                postvotes[postid] += 1
            else:
                postvotes[postid] = 1
        postvotes_per_thread[threadid] = postvotes
    return postvotes_per_thread
###########
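# replace_quote strips a Viva forum quotation block from the post content
# (" schreef op " is Dutch for " wrote on "); the text before and after the
# quote is kept, with blocks joined by "<br>\n".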
def replace_quote(postcontent):
    adapted = ""
    blocks = re.split("\n\n", postcontent)
    #print(blocks)
    # first, find the block with the quote:
    bi = 0
    bc = len(blocks)
    quoteblocki = 4
    while bi < bc:
        if " schreef op " in blocks[bi]:
            #print(blocks[bi])
            quoteblocki = bi
            break
        # print until the quote:
        if not re.match('^>', blocks[bi]):
            adapted += blocks[bi] + "<br>\n"
        bi += 1
    blocks[quoteblocki] = re.sub("^>", "", blocks[quoteblocki])
    blocks[quoteblocki] = re.sub(r"\(http://.*\):", "", blocks[quoteblocki])
    #quote = blocks[quoteblocki]+blocks[quoteblocki+1]+"<br>\n"+blocks[quoteblocki+2]+"<br>\n"+blocks[quoteblocki+3]+"<br>\n"
    quote = blocks[quoteblocki] + "<br>\n"
    if len(blocks) > quoteblocki + 1:
        quote += blocks[quoteblocki+1] + "<br>\n"
    #adapted += "<div style='background-color:rgb(240,240,240);padding-left:4em;padding-right:4em'>"+quote+"</div><br>\n"
    bi = quoteblocki + 1
    while bi < bc:
        adapted += blocks[bi] + "<br>\n"
        bi += 1
    return adapted
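

# tokenize lowercases the text, removes everything except (accented) letters,
# digits, hyphens, apostrophes and spaces, and splits on whitespace.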
def tokenize(t):
    text = t.lower()
    text = re.sub("\n", " ", text)
    text = re.sub("[^a-zèéeêëûüùôöòóœøîïíàáâäæãåA-Z0-9- ']", "", text)
    wrds = text.split()
    return wrds
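

# Patterns for the rule-based sentence splitter below; the abbreviations cover
# Dutch honorifics (Dhr ~ Mr, Mevr ~ Mrs, Ir/Ing are engineering titles).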
caps = "([A-Z])"
prefixes = "(Dhr|Mevr|Dr|Drs|Mr|Ir|Ing)[.]"
suffixes = "(BV|MA|MSc|BSc|BA)"
starters = "(Dhr|Mevr|Dr|Drs|Mr|Ir|Ing)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|nl)"
def split_into_sentences(text):
    # adapted from http://stackoverflow.com/questions/4576077/python-split-text-on-sentences
    text = " " + text + " "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = re.sub(r"([\.\?!]+\)?)", "\\1<stop>", text)
    if "<stop>" not in text:
        text += "<stop>"
    text = text.replace("<prd>", ".")
    text = re.sub(' +', ' ', text)
    sents = text.split("<stop>")
    sents = sents[:-1]
    sents = [s.strip() for s in sents]
    return sents
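

# count_punctuation returns the fraction of characters in t that are
# punctuation marks (string.punctuation), or 0 for empty text.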
def count_punctuation(t):
    punctuation = string.punctuation
    punctuation_count = len(list(filter(functools.partial(operator.contains, punctuation), t)))
    textlength = len(t)
    relpc = 0
    if textlength > 0:
        relpc = float(punctuation_count) / float(textlength)
    return relpc
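

# nrofsyllables is a heuristic, English-style syllable counter: it counts
# vowel groups, compensates for a silent final 'e' (but not '-le') and
# guarantees at least one syllable, e.g. nrofsyllables("forum") -> 2,
# nrofsyllables("thread") -> 1.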
def nrofsyllables(w):
    count = 0
    vowels = 'aeiouy'
    w = w.lower().strip(".:;?!")
    if w[0] in vowels:
        count += 1
    for index in range(1, len(w)):
        if w[index] in vowels and w[index-1] not in vowels:
            count += 1
    if w.endswith('e'):
        count -= 1
    if w.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count
def readability(wl, sl):
    # English: Flesch reading ease: 206.835 - (0.846 x nr of syllables per 100 words) - (1.015 x avg nr of words per sentence)
    # Dutch: Flesch-Douma: 206.84 - (0.77 x nr of syllables per 100 words) - (0.33 x (words / sentences))
    # wl = average nr of syllables per word, sl = average sentence length in words
    flesch = 206.84 - 0.77 * 100 * wl - 0.33 * sl
    return flesch
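# worked example: a post averaging 1.5 syllables per word and 20 words per
# sentence gets readability(1.5, 20.0) = 206.84 - 115.5 - 6.6 = 84.74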
def printTopNPosts(d, n):
    i = 0
    for pid in sorted(d, key=d.get, reverse=True):
        i += 1
        if i > n:
            break
        else:
            print(pid, d[pid])
            # print postid, bodies[pid], d[pid]


def getTopNAuthors(d, n, minnrofposts):
    topauthors = list()
    i = 0
    print(" Top" + str(n) + " authors:")
    for aut in sorted(d, key=d.get, reverse=True):
        i += 1
        print("\t" + aut, d[aut])
        if i > n or d[aut] < minnrofposts:
            break
        else:
            topauthors.extend([aut])
    return topauthors


def normalizeVector(termvector):
    sumofsquares = 0
    for component in termvector.values():
        sumofsquares = sumofsquares + component * component
    vectorlength = math.sqrt(sumofsquares)
    for term in termvector.keys():
        normvalue = termvector[term] / vectorlength
        termvector[term] = normvalue
    return termvector
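

# fast_cosine_sim computes the cosine similarity of two term vectors stored as
# dicts; it assumes every key of the shorter dict is also present in the other,
# which holds here because all vectors are zero-padded to the thread vocabulary.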
def fast_cosine_sim(a, b):
    #print(a)
    if len(b) < len(a):
        a, b = b, a
    up = 0
    a_value_array = []
    b_value_array = []
    for key in a:
        a_value = a[key]
        b_value = b[key]
        a_value_array.append(a_value)
        b_value_array.append(b_value)
        up += a_value * b_value
    if up == 0:
        return 0
    return up / norm(a_value_array) / norm(b_value_array)
columns = dict() # key is feature name, value is dict with key (threadid,postid) and value the feature value
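# standardize_values z-scores one feature column: (value - mean) / stdev over
# all (threadid,postid) keys; e.g. the values [1, 2, 3] become approximately
# [-1.22, 0.0, 1.22] (numpy.std is the population standard deviation).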
def standardize_values(columndict, feature):
    values = list()
    for (threadid, postid) in columndict:
        values.append(columndict[(threadid, postid)])
    mean = numpy.mean(values)
    stdev = numpy.std(values)
    normdict = dict()  # key is (threadid,postid) and value the standardized feature value
    for (threadid, postid) in columndict:
        value = columndict[(threadid, postid)]
        if stdev == 0.0:
            stdev = 0.000000000001  # avoid division by zero for constant columns
            print("stdev is 0! ", feature, value, mean, stdev)
        normvalue = (float(value) - float(mean)) / float(stdev)
        normdict[(threadid, postid)] = normvalue
        # if feature == "noofupvotes":
        #     print(threadid, postid, feature, float(value), mean, stdev, normvalue, len(columndict))
    return normdict
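

########### MAIN SCRIPT ###########
# Read the annotations (if any), extract features from the XML threads,
# standardize the feature columns, and write the feature file.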
postvotes_per_thread = dict()
if annotationsfile != "":
    print("Read annotations")
    postvotes_per_thread = get_postvotes_per_thread(annotationsfile)
print("Read files in", rootdir, "\n")
#for focususer in focususers:
#featfile = open("postfeats."+focususer+".out",'w')
featnames = ("threadid","postid","abspos","relpos","noresponses","noofupvotes","cosinesimwthread","cosinesimwtitle","wordcount","uniquewordcount","ttr","relpunctcount","avgwordlength","avgsentlength","relauthorcountsinthread")
featfile = open(featfilename, 'w')
# write the tab-separated header row
for featname in featnames:
    featfile.write(featname + "\t")
featfile.write("votes_for_this_post\n")
openingpost_for_thread = dict() # key is threadid, value is id of opening post
postids_dict = dict() # key is (threadid,postid), value is postid. Needed for pasting the columns at the end
threadids = dict() # key is (threadid,postid), value is threadid. Needed for pasting the columns at the end
upvotecounts = dict() # key is (threadid,postid), value is # of upvotes
responsecounts = dict() # key is (threadid,postid), value is # of replies
cosinesimilaritiesthread = dict() # key is (threadid,postid), value is cossim with term vector for complete thread
cosinesimilaritiestitle = dict() # key is (threadid,postid), value is cossim with term vector for title
uniquewordcounts = dict() # key is (threadid,postid), value is unique word count in post
wordcounts = dict() # key is (threadid,postid), value is word count in post
typetokenratios = dict() # key is (threadid,postid), value is type-token ratio in post
abspositions = dict() # key is (threadid,postid), value is absolute position in thread
relpositions = dict() # key is (threadid,postid), value is relative position in thread
relauthorcountsinthreadforpost = dict() # key is (threadid,postid), value is relative number of posts by author in this thread
relpunctcounts = dict() # key is (threadid,postid), value is relative punctuation count in post
avgwordlengths = dict() # key is (threadid,postid), value is average word length (nr of characters)
avgnrsyllablesinwords = dict() # key is (threadid,postid), value is average nr of syllables per word
avgsentlengths = dict() # key is (threadid,postid), value is average sentence length (nr of words)
readabilities = dict() # key is (threadid,postid), value is readability
bodies = dict() # key is (threadid,postid), value is content of post
#print(time.process_time(), "\t", "go through files")
for f in os.listdir(rootdir):
    if f.endswith("xml"):
        print(time.process_time(), "\t", f)
        postids = list()
        termvectors = dict() # key is postid, value is dict with term -> termcount for post
        termvectorforthread = dict() # key is term, value is termcount for full thread
        termvectorfortitle = dict() # key is term, value is termcount for title
        authorcountsinthread = dict() # key is authorid, value is number of posts by author in this thread
        tree = ET.parse(rootdir + "/" + f)
        root = tree.getroot()
        for thread in root:
            threadid = thread.get('id')
            category = thread.find('category').text
            title = thread.find('title').text
            if title is None:
                # no title, then use complete opening post instead
                for posts in thread.findall('posts'):
                    title = posts.findall('post')[0].find('body').text
                if title is None:
                    title = ""
            #print(threadid, title)
            titlewords = tokenize(title)
            for tw in titlewords:
                if tw in termvectorfortitle:
                    termvectorfortitle[tw] += 1
                else:
                    termvectorfortitle[tw] = 1
            # first go through the thread to find all authors
            for posts in thread.findall('posts'):
                for post in posts.findall('post'):
                    author = post.find('author').text
                    if author in authorcountsinthread:
                        authorcountsinthread[author] += 1
                    else:
                        authorcountsinthread[author] = 1
            postvotes_for_this_thread = dict()
            if threadid in postvotes_per_thread:
                postvotes_for_this_thread = postvotes_per_thread[threadid]
            for posts in thread.findall('posts'):
                noofposts = len(posts.findall('post'))
                if noofposts > 50:
                    noofposts = 50
                postcount = 0
                #print(time.process_time(), "\t", "extract feats from each post")
                for post in posts.findall('post'):
                    postcount += 1
                    postid = post.get('id')
                    if postcount == 1:
                        openingpost_for_thread[threadid] = postid
                        #print("opening post:", postid)
                    #print(postid, postcount)
                    if 1 < postcount <= 51:
                        # don't include opening post in feature set
                        # and include at most 50 responses (because we only have reference data up to post 50)
                        postids.append(postid)
                        postids_dict[(threadid,postid)] = postid
                        threadids[(threadid,postid)] = threadid
                        parentid = post.find('parentid').text
                        if parentid != openingpost_for_thread[threadid]:
                            # do not save responses for openingpost because openingpost will not be in feature file
                            # (and disturbs the column for standardization)
                            if (threadid,parentid) in responsecounts:
                                responsecounts[(threadid,parentid)] += 1
                            else:
                                responsecounts[(threadid,parentid)] = 1
                        #else:
                        #    print("don't add responsecounts because opening post:", parentid)
                        upvotes = 0
                        upvotesitem = post.find('upvotes')
                        if upvotesitem is None:
                            upvotes = 0
                        else:
                            upvotes = upvotesitem.text
                        if upvotes is None:
                            upvotes = 0
                        #print(threadid, postid, upvotes, responsecounts[(threadid,parentid)])
                        upvotecounts[(threadid,postid)] = int(upvotes)
                        selected = post.find('selected').text
                        # in the NYtimes data, the postvotes are the editor picks ('selected')
                        # - they are part of the original data and included in the XML
                        if selected is not None:
                            postvotes_for_this_thread[postid] = selected
                        body = post.find('body').text
                        if postcount > 51:
                            continue
                        elif postid == "0":
                            continue
                        elif body is None:
                            body = ""
                        author = post.find('author').text
                        relauthorcountsinthreadforpost[(threadid,postid)] = float(authorcountsinthread[author])/float(noofposts)
                        #print(threadid, postid, author, authorcountsinthread[author])
                        if " schreef op " in body:
                            body = replace_quote(body)
                            #print(threadid, postid, body)
                        if "smileys" in body:
                            body = re.sub(r'\((http://forum\.viva\.nl/global/(www/)?smileys/.*\.gif)\)', '', body)
                        if "http" in body:
                            body = re.sub(r'http://[^ ]+', '', body)
                        bodies[(threadid,postid)] = body
                        words = tokenize(body)
                        wc = len(words)
                        sentences = split_into_sentences(body)
                        sentlengths = list()
                        for s in sentences:
                            sentwords = tokenize(s)
                            nrofwordsinsent = len(sentwords)
                            #print(s, nrofwordsinsent)
                            sentlengths.append(nrofwordsinsent)
                        if len(sentences) > 0:
                            avgsentlength = numpy.mean(sentlengths)
                            avgsentlengths[(threadid,postid)] = avgsentlength
                        else:
                            avgsentlengths[(threadid,postid)] = 0
                        relpunctcount = count_punctuation(body)
                        relpunctcounts[(threadid,postid)] = relpunctcount
                        #print(body, relpunctcount)
                        wordcounts[(threadid,postid)] = wc
                        uniquewords = dict()
                        wordlengths = list()
                        nrofsyllablesinwords = list()
                        for word in words:
                            #print(word, nrofsyllables(word))
                            nrofsyllablesinwords.append(nrofsyllables(word))
                            wordlengths.append(len(word))
                            uniquewords[word] = 1
                            if word in termvectorforthread: # dictionary over all posts
                                termvectorforthread[word] += 1
                            else:
                                termvectorforthread[word] = 1
                            worddict = dict()
                            if postid in termvectors:
                                worddict = termvectors[postid]
                            if word in worddict:
                                worddict[word] += 1
                            else:
                                worddict[word] = 1
                            termvectors[postid] = worddict
                        uniquewordcount = len(uniquewords)
                        uniquewordcounts[(threadid,postid)] = uniquewordcount
                        readabilities[(threadid,postid)] = 0
                        if wc > 0:
                            avgwordlength = numpy.mean(wordlengths)
                            #avgnrsyllablesinword = numpy.mean(nrofsyllablesinwords)
                            avgwordlengths[(threadid,postid)] = avgwordlength
                            #avgnrsyllablesinwords[(threadid,postid)] = avgnrsyllablesinword
                            #readabilities[(threadid,postid)] = readability(avgnrsyllablesinword, avgsentlength)
                        else:
                            avgwordlengths[(threadid,postid)] = 0
                        #print(threadid, postid, wc, avgnrsyllablesinword, avgsentlength, readability(avgnrsyllablesinword, avgsentlength))
                        typetokenratio = 0
                        if wordcounts[(threadid,postid)] > 0:
                            typetokenratio = float(uniquewordcount) / float(wordcounts[(threadid,postid)])
                        typetokenratios[(threadid,postid)] = typetokenratio
                        relposition = float(postcount)/float(noofposts)
                        #relposition = float(postid)/float(noofposts)
                        relpositions[(threadid,postid)] = relposition
                        abspositions[(threadid,postid)] = postcount
                        #abspositions[(threadid,postid)] = postid
            postvotes_per_thread[threadid] = postvotes_for_this_thread
        #print(time.process_time(), "\t", "fill term vectors")
        #print(wordcounts)
        # each XML file is assumed to contain a single thread, so threadid below
        # still refers to the thread that was just processed
        # add zeroes for titleterms that are not in the thread vector
        for titleword in termvectorfortitle:
            if titleword not in termvectorforthread:
                termvectorforthread[titleword] = 0
        # add zeroes for terms that are not in the title vector:
        for word in termvectorforthread:
            if word not in termvectorfortitle:
                termvectorfortitle[word] = 0
        # add zeroes for terms that are not in the post vector:
        for postid in termvectors:
            worddictforpost = termvectors[postid]
            for word in termvectorforthread:
                if word not in worddictforpost:
                    worddictforpost[word] = 0
            termvectors[postid] = worddictforpost
            #for term in termvectorforthread:
            #    print(postid, term, termvectors[postid][term])
            cossimthread = fast_cosine_sim(termvectors[postid], termvectorforthread)
            cossimtitle = fast_cosine_sim(termvectors[postid], termvectorfortitle)
            cosinesimilaritiesthread[(threadid,postid)] = cossimthread
            cosinesimilaritiestitle[(threadid,postid)] = cossimtitle
            #print(postid, cossimthread)
        for postid in postids:
            #print(postid, abspositions[(threadid,postid)])
            if not (threadid,postid) in cosinesimilaritiesthread:
                cosinesimilaritiesthread[(threadid,postid)] = 0.0
            if not (threadid,postid) in cosinesimilaritiestitle:
                cosinesimilaritiestitle[(threadid,postid)] = 0.0
            if not (threadid,postid) in responsecounts:
                # don't store the counts for the openingpost
                #print("postid not in responsecounts", postid, "opening post:", openingpost_for_thread[threadid])
                responsecounts[(threadid,postid)] = 0
            #else:
            #sel = 0
            #if postid in selected_by_focususer:
            #    sel = 1
            votes_for_this_post = 0
            if postid in postvotes_for_this_thread:
                votes_for_this_post = postvotes_for_this_thread[postid]
        #print(time.process_time(), "\t", "add feature values to columns for all threads")
        #print(time.process_time(), "\t", "standardize feat values")
        columns_for_thread = dict()
        columns_for_thread.clear()
        columns_for_thread["threadid"] = threadids
        columns_for_thread["postid"] = postids_dict
        columns_for_thread["abspos"] = abspositions
        columns_for_thread["relpos"] = relpositions
        columns_for_thread["noresponses"] = responsecounts
        columns_for_thread["noofupvotes"] = upvotecounts
        columns_for_thread["cosinesimwthread"] = cosinesimilaritiesthread
        columns_for_thread["cosinesimwtitle"] = cosinesimilaritiestitle
        columns_for_thread["wordcount"] = wordcounts
        columns_for_thread["uniquewordcount"] = uniquewordcounts
        columns_for_thread["ttr"] = typetokenratios
        columns_for_thread["relpunctcount"] = relpunctcounts
        columns_for_thread["avgwordlength"] = avgwordlengths
        columns_for_thread["avgsentlength"] = avgsentlengths
        columns_for_thread["relauthorcountsinthread"] = relauthorcountsinthreadforpost
        columns_std = dict()
        for featurename in featnames:
            columndict = []
            if featurename in columns_for_thread:
                columndict = columns_for_thread[featurename]
            else:
                print("featurename", featurename, "is not in columns dictionary")
            columndict_with_std_values = columndict
            if featurename != "postid" and featurename != "threadid":
                columndict_with_std_values = standardize_values(columndict, featurename)
            columns_std[featurename] = columndict_with_std_values
        #print(time.process_time(), "\t", "print to feature file")
        for postid in postids:
            for featname in featnames:
                columndict_std = columns_std[featname]
                #print(featname, columndict_std)
                if (threadid,postid) in columndict_std:
                    featfile.write(str(columndict_std[(threadid,postid)]) + "\t")
                else:
                    featfile.write("NA\t")
                    print("not in columndict for feature", featname, ":", threadid, postid)
            postvotes_for_this_thread = postvotes_per_thread[threadid]
            #print(threadid, postvotes_for_this_thread)
            votes_for_this_post = 0
            if postid in postvotes_for_this_thread:
                votes_for_this_post = postvotes_for_this_thread[postid]
            featfile.write(str(votes_for_this_post) + "\n")
featfile.close()