-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping.py
109 lines (81 loc) · 2.87 KB
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from bs4 import BeautifulSoup
import requests
import csv
import json
import pandas as pd
import jieba
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def url2json(url):
    """GET *url* and return the decoded JSON body.

    Raises requests.Timeout if the server stalls, and
    requests.HTTPError on a non-2xx response, instead of failing
    later with an opaque JSON decode error.
    """
    # timeout keeps the scrape loop from hanging forever on a dead host
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()
def watch_data(uid, page=0):
    """Return the 'data' payload of one page of a user's watch history.

    uid  -- numeric user id to query
    page -- zero-based page number; the API serves 50 entries per page
    """
    # Original mixed a placeholder-less f-string with str()+'+' concatenation;
    # a single f-string builds the identical URL.
    url = (f"https://ukamnads.icu/api/v2/user?uId={uid}"
           f"&pageNum={page}&pageSize=50&target=-1&useEmoji=true")
    return url2json(url)['data']
def generate_cloud(uid):
    """Show a word cloud of the channels a user has watched.

    Word size is weighted by each channel's watch 'count'.  Displays the
    figure interactively via matplotlib; returns None.  Prints a notice
    and returns early when the user has no watch history.
    """
    data = watch_data(uid)['data']
    # channel name -> watch count, used directly as word-cloud frequencies
    word_freq = {user['uName']: user['count'] for user in data}
    # Guard BEFORE any rendering work (the original also built an unused
    # jieba-segmented string here; that dead computation is removed).
    if not word_freq:
        print("This person has never watched a live stream.")
        return
    wordcloud = WordCloud(font_path='ZiYuYongSongTi-2.ttf', width=800, height=400, background_color='white')
    wordcloud.generate_from_frequencies(word_freq)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    # wordcloud.to_file("wordcloud.png")
def uid_search(uid):
    """Download a user's full danmaku history and save it to users/<uid>.csv.

    Pages through the API until 'hasMore' is false.  If the user has no
    history (total == 0 or -1) it returns without writing a file,
    matching the original behavior.
    """
    # Column keys; 'type' 0 = comment, 2 = purchase, 4 = enter-room message
    keys = ['cUid', 'cUname', 'cParentArea', 'cArea', 'type', 'sendDate', 'message', 'price', 'count']
    folder_path = os.path.join(os.getcwd(), "users")
    # exist_ok avoids the check-then-create race of the original
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, str(uid) + ".csv")
    # Accumulate row dicts and build ONE DataFrame at the end: the original
    # pd.concat per danmaku copied the whole frame each time (O(n^2)).
    rows = []
    page_num = 0
    while True:
        w_data = watch_data(uid, page_num)
        if w_data['total'] == 0 or w_data['total'] == -1:
            return
        flag = w_data['hasMore']
        w_data = w_data['data']
        page_num += 1
        # walk every record on this page
        for record in w_data['records']:
            channel = record['channel']
            live = record.get('live', {})
            for danmaku in record['danmakus']:
                row = {
                    'cUid': channel['uId'],
                    'cUname': channel['uName'],
                    'cParentArea': live.get('parentArea', ''),
                    'cArea': live.get('area', ''),
                }
                # copy whichever of the expected keys this danmaku carries
                for key in keys:
                    if key in danmaku:
                        row[key] = danmaku[key]
                rows.append(row)
        if not flag:
            break
    pd.DataFrame(rows, columns=keys).to_csv(file_path, index=False)
# uid range to scrape
start_uid = 210000
end_uid = 210100

# Guard so merely importing this module does not kick off a network
# scrape of 100 users (the original ran the loop at import time).
if __name__ == "__main__":
    for i in range(start_uid, end_uid):
        print(i)  # progress indicator: uid currently being fetched
        uid_search(i)