-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractAllPosts2LIWC.py
executable file
·101 lines (91 loc) · 2.87 KB
/
extractAllPosts2LIWC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import string
import sys
import csv
import re
import labelNU
# The number of LIWC words a post must contain to be counted is command-line argument 1 (sys.argv[1]).
def myDictFilter(lst, index):
    """Return the distinct non-empty entries of one column of lst.

    lst is a sequence of columns and index is 1-based.  The first three
    entries of the selected column are skipped (presumably header cells --
    confirm against the dictionary file) and falsy cells such as empty
    strings are dropped before duplicates are removed.
    """
    column = lst[index - 1]
    entries = [cell for cell in column[3:] if cell]
    return list(dict.fromkeys(entries))
def _containsStandaloneWord(word, text):
    """Return True if word occurs in text bounded by spaces or the string edges.

    Every occurrence is examined (overlapping ones included), so an earlier
    hit embedded in a longer token does not hide a later standalone hit.  The
    word is matched literally, never interpreted as a regular expression.
    """
    if not word:
        # An empty search term (e.g. a bare "*" entry) carries no information.
        return False
    textLength = len(text)
    start = text.find(word)
    while start != -1:
        end = start + len(word)
        leftOk = start == 0 or text[start - 1] == " "
        rightOk = end == textLength or text[end] == " "
        if leftOk and rightOk:
            return True
        start = text.find(word, start + 1)
    return False


def LIWCPosts(path, school, category):
    """Count the posts of one school that contain enough LIWC words of a category.

    A post is counted once the number of category words found in it reaches
    int(sys.argv[1]).  A word is "found" when it appears in the post text
    delimited by spaces or by the start/end of the text.

    Args:
        path: directory prefix (including trailing separator) of the posts file.
        school: colon-separated record; the third field is the total number of
            posts, and everything before the last colon, followed by
            "posts.csv", names the posts file under path.
        category: one of "sex", "religious", "death", "sad", "anger".

    Returns:
        A tuple (totalPosts, avg, numLIWCPosts): the third field of school
        exactly as it appears (a string), the summed character length of all
        post texts floor-divided by that total (0 when the total is 0), and
        the number of posts that reached the word threshold.

    Raises:
        IndexError: if sys.argv[1] is missing, school has fewer than three
            colon-separated fields, or a post line has fewer than three
            "|"-separated fields.
        KeyError: if category is not a known category.
        ValueError: if sys.argv[1] or the post total is not an integer.
    """
    # Parse the threshold once up front instead of on every matching word.
    threshold = int(sys.argv[1])

    # 'rU' is the Python 2 universal-newline read mode this script was written
    # for; note that the 'U' flag no longer exists on Python 3.11+.
    with open('Excel_Data/LIWCdict.csv', 'rU') as f:
        reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
        rows = [[x.strip() for x in row] for row in reader]
    # Transpose rows into columns; list() keeps the result indexable even
    # where zip() returns a lazy iterator.
    lst = list(zip(*rows))

    LIWC = {
        "sex": myDictFilter(lst, 78),
        "religious": myDictFilter(lst, 100) + myDictFilter(lst, 101),
        "death": myDictFilter(lst, 102),
        "sad": myDictFilter(lst, 50),
        "anger": myDictFilter(lst, 48) + myDictFilter(lst, 49),
    }

    totalPosts = school.split(":")[2]
    postsFile = path + school.rsplit(":", 1)[0] + "posts.csv"
    with open(postsFile) as postsHandle:
        # The first line of the posts file is skipped (header row).
        data = postsHandle.readlines()[1:]

    wordSet = LIWC[category]
    totalLength = 0
    numTagged = 0
    for line in data:
        # The post text is the third "|"-separated field of the line.
        # NOTE(review): if that is the last field it still carries the line's
        # trailing newline, so a word at the very end of a post fails the
        # right-boundary test -- confirm the file layout.
        pst = line.split("|")[2]
        totalLength += len(pst)
        hits = 0
        for word in wordSet:
            # NOTE(review): a trailing "*" is stripped and the remainder is
            # still matched as a whole word, so the entry does not act as a
            # prefix wildcard -- confirm this is the intended LIWC semantics.
            if word.endswith("*"):
                word = word[:-1]
            if _containsStandaloneWord(word, pst):
                hits += 1
                if hits == threshold:
                    # The post qualifies; further words cannot change that.
                    numTagged += 1
                    break

    postCount = int(totalPosts)
    # Floor division keeps the integer average the script has always written;
    # guard against a school that reports zero posts.
    avg = totalLength // postCount if postCount else 0
    return totalPosts, avg, numTagged
def main():
    """Write one LIWC summary CSV per school listed in lenPosts.dat.

    For every line of lenPosts.dat, a file named after the part of the line
    before its last colon is created under LIWCPosts/ and filled with one row
    per category: category, total posts, average post length and the number
    of posts tagged by LIWCPosts.  Progress messages are printed to stdout.
    """
    categories = ["sex", "religious", "death", "sad", "anger"]
    path = "LIWCPosts/"
    # Context managers make sure both the listing and every output file are
    # closed even if processing a school raises.
    with open("lenPosts.dat") as listing:
        fileList = listing.readlines()
    for school in fileList:
        schoolPrefix = school.rsplit(":", 1)[0]
        with open(path + schoolPrefix + ".csv", 'w') as f:
            f.write("category, totalPosts, averageLength, numLIWCPosts \n")
            for category in categories:
                totalPosts, avg, numLIWCPosts = LIWCPosts("USA/", school, category)
                f.write(category + "," + str(totalPosts).strip() + "," + str(avg) + "," + str(numLIWCPosts) + "\n")
                # Single-argument call form prints identically under the
                # print statement and the print function.
                print(schoolPrefix + ":" + category + "processed...............")
        print("")
        print(schoolPrefix + "completely processed!!!!!!!!!!!!!!!!!!")
        print("")
        print("")
# Run the batch only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()