-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathamazon.py
64 lines (54 loc) · 1.78 KB
/
amazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import datetime
import csv
import json
### this is test comment
# Convert a blank-line-separated review dump into TSV and JSON (Python 3).
#
# The input is text where each record is a run of non-empty lines of the
# form "prefix/key: value", and records are separated by empty lines.

# Default file locations used when the script is run directly.
INPUT_PATH = 'movies.small.txt'
TSV_PATH = 'movie.small.output.tsv'
JSON_PATH = 'movie.small.output.json'
# Columns written to the TSV output, in this order; the JSON output keeps
# every parsed field.
TSV_FIELDS = ['productId', 'profileName', 'helpfulness', 'score', 'time']


def _read_lines(path):
    """Return every line of *path* with surrounding whitespace stripped.

    Bytes that are not valid UTF-8 (e.g. a stray 0xf8) are dropped instead of
    raising, so a single bad byte never loses or duplicates a whole line.
    """
    with open(path, encoding='utf-8', errors='ignore') as infile:
        return [line.strip() for line in infile]


def _group_records(lines):
    """Split *lines* into lists of consecutive non-empty lines.

    Empty lines act as separators. Runs of several empty lines do not create
    empty groups, and a final group is kept even when the input does not end
    with an empty line.
    """
    groups = []
    current = []
    for line in lines:
        if line:
            current.append(line)
        elif current:
            groups.append(current)
            current = []
    if current:
        groups.append(current)
    return groups


def _parse_record(record_lines):
    """Build a ``{key: value}`` dict from "prefix/key: value" lines.

    The stored key is the second '/'-separated segment of the text before
    ': ' (the whole text when it contains no '/'). Lines that carry no
    separator at all are skipped rather than aborting the run.
    """
    record = {}
    for item in record_lines:
        key_part, sep, value = item.partition(': ')
        if not sep:
            # Stripping turns an empty field ("key: ") into "key:"; keep it
            # as an empty value. Anything else is not a field line.
            if not item.endswith(':'):
                continue
            key_part, value = item[:-1], ''
        segments = key_part.split('/')
        key = segments[1] if len(segments) > 1 else segments[0]
        record[key] = value
    return record


def _convert_fields(record):
    """Convert 'score' to float and 'time' to a 'month:day:year' string.

    The record is modified in place and returned. Fields that are absent are
    left absent. The timestamp is interpreted in UTC so the resulting date
    does not depend on the time zone of the machine running the script.
    """
    if 'score' in record:
        record['score'] = float(record['score'])
    if 'time' in record:
        moment = datetime.datetime.fromtimestamp(
            float(record['time']), tz=datetime.timezone.utc)
        record['time'] = '{}:{}:{}'.format(
            moment.month, moment.day, moment.year)
    return record


def _write_tsv(records, path):
    """Write the TSV_FIELDS columns of *records* to *path*, tab-separated."""
    # newline='' is what the csv module requires to control line endings.
    with open(path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile,
                                TSV_FIELDS,
                                delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(records)


def _write_json(records, path):
    """Dump *records* (with all of their fields) to *path* as a JSON array."""
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(records, outfile)


def main(input_path=INPUT_PATH, tsv_path=TSV_PATH, json_path=JSON_PATH):
    """Read the review file and write its TSV and JSON representations.

    Args:
        input_path: text file holding blank-line-separated review records.
        tsv_path: destination of the tab-separated output.
        json_path: destination of the JSON output.
    """
    records = [
        _convert_fields(_parse_record(group))
        for group in _group_records(_read_lines(input_path))
    ]
    _write_tsv(records, tsv_path)
    _write_json(records, json_path)


if __name__ == '__main__':
    main()