# VK-Communities-Parser.py
#
# Downloads the text of wall posts from a VKontakte community through the
# VK API, saves them to a text file, and then cleans the saved text
# (removes lines with links, emojis, mentions, and duplicate posts).
import requests
import re
import demoji
import time
def get_group_id(group_domain, access_token):
    """Resolve a VK community's ID from its domain (short name).

    Calls the ``groups.getById`` API method (API version 5.131) and returns
    the ``id`` field of the first entry in the ``response`` list.

    Args:
        group_domain: Community domain sent as the ``group_id`` parameter.
        access_token: VK API access token.

    Returns:
        The value of ``data['response'][0]['id']``.

    Raises:
        RuntimeError: If the reply contains no usable ``response`` entry
            (for example, the API answered with an ``error`` object).
        requests.RequestException: On a network failure or timeout.
    """
    base_url = 'https://api.vk.com/method/groups.getById'
    params = {
        'group_id': group_domain,
        'access_token': access_token,
        'v': '5.131',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(base_url, params=params, timeout=30)
    data = response.json()

    groups = data.get('response')
    if not groups:
        # Surface the API's own message instead of a bare KeyError/IndexError.
        error = data.get('error') or {}
        reason = error.get('error_msg', 'unexpected response format')
        raise RuntimeError(
            f'Could not resolve group {group_domain!r}: {reason}'
        )
    return groups[0]['id']
def get_posts(group_id, access_token, count=100, *, delay=0):
    """Download the text of up to ``count`` wall posts of a VK community.

    Posts are requested from ``wall.get`` in pages of at most 100 items.
    Posts with empty text are skipped; line breaks inside a post are
    replaced with spaces so that each post occupies a single line.

    Args:
        group_id: Numeric community ID (sent negated as ``owner_id``).
        access_token: VK API access token.
        count: Maximum number of wall posts to request.
        delay: Seconds to sleep between consecutive requests. The default of
            0 keeps the previous behaviour; 0.35 is the value recommended
            when the API starts returning errors.

    Returns:
        A list of post texts (possibly shorter than ``count``).
    """
    base_url = 'https://api.vk.com/method/wall.get'
    posts = []
    offset = 0
    while count > 0:
        params = {
            'owner_id': -group_id,
            'count': min(count, 100),
            'offset': offset,
            'access_token': access_token,
            'v': '5.131',
        }
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.get(base_url, params=params, timeout=30)
        data = response.json()

        if 'response' not in data or 'items' not in data['response']:
            print('Error occurred while getting posts. Please try adjusting the recommended delay.')
            break

        items = data['response']['items']
        if not items:
            # The wall has no more posts. Without this check neither `count`
            # nor `offset` would change and the loop would never terminate.
            break

        # Extract the text from posts and add them to the list
        posts.extend(
            post['text'].replace('\n', ' ')
            for post in items
            if post.get('text')
        )
        count -= len(items)
        offset += len(items)

        # Pause only when another request will follow, to stay within the
        # API rate limits.
        if count > 0:
            time.sleep(delay)
    return posts
def save_posts_to_file(posts, filename, append=False):
    """Write posts to a UTF-8 text file, separated by blank lines.

    Every post is followed by a double line break, so consecutive appends
    keep the same separator between posts.

    Args:
        posts: Iterable of post texts.
        filename: Path of the file to write.
        append: If true, add to the end of the file; otherwise overwrite it.

    An empty ``posts`` still creates (or, when overwriting, truncates) the
    file, but writes nothing, so no stray blank lines are added.
    """
    entries = list(posts)
    mode = 'a' if append else 'w'
    with open(filename, mode, encoding='utf-8') as file:
        if entries:
            # Write the posts to the file, separating them with double line breaks
            file.write('\n\n'.join(entries) + '\n\n')
def process_posts_file(filename):
    """Clean a saved posts file in place.

    Reads ``filename`` (UTF-8), then:

    1. collapses runs of line breaks to a single blank-line separator;
    2. deletes every line that contains a link, a "see the source"-style
       phrase, or the rouble sign;
    3. strips ``[club123|...]`` / ``[id123|...]`` mentions, non-breaking and
       zero-width spaces, and emojis;
    4. collapses repeated spaces and blank lines;
    5. drops duplicate posts, keeping the first occurrence of each so the
       original order is preserved;

    and writes the result back to the same file.

    Args:
        filename: Path of the posts file to rewrite.
    """
    # A line matching any of these patterns is removed entirely, together
    # with its trailing line break. Dots in domain names are escaped so that
    # e.g. `t\.me` no longer matches ordinary words such as "time".
    line_patterns = (
        r'.*https?://\S+.*\n?',
        r'.*vk\.me/.*\n?',
        r'.*vk\.com/.*\n?',
        r'.*t\.me.*\n?',
        r'.*ССЫЛКА В ИСТОЧНИКЕ.*\n?',
        r'.*Ссылка в источнике.*\n?',
        r'.*cсылка в источнике.*\n?',
        r'.*в источнике.*\n?',
        r'.*Чекай источник.*\n?',
        r'.*чекай источник.*\n?',
        r'.*СМОТРИ ИСТОЧНИК.*\n?',
        r'.*Жми на источник.*\n?',
        r'.*жми на источник.*\n?',
        r'.*посмотреть источник.*\n?',
        r'.*Посмотреть источник.*\n?',
        r'.*Источник снизу.*\n?',
        r'.*источник снизу.*\n?',
        r'.*Смотри источник.*\n?',
        r'.*смотри источник.*\n?',
        r'.*в источник.*\n?',
        r'.*в ucтoчнuк.*\n?',
        r'.*В источнике.*\n?',
        r'.*ссылка в комментах.*\n?',
        r'.*\(ссылка в источнике\).*\n?',
        r'.*\(в источник\).*\n?',
        r'.*₽.*\n?',
    )

    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Remove extra line breaks, links, emojis, tags,
    # currency symbols, group mentions, and extra spaces
    content = re.sub(r'\n{2,}', '\n\n', content)
    for pattern in line_patterns:
        content = re.sub(pattern, '', content)
    content = re.sub(r'\[club\d+\|[^\]]+\]', '', content)
    content = re.sub(r'\[id\d+\|[^\]]+\]', '', content)
    content = content.replace('\xa0', ' ')
    content = content.replace('\u200B', '')
    content = demoji.replace(content, '')
    content = re.sub(r'\n\n+', '\n\n', content.strip())
    content = re.sub(r' +', ' ', content)

    # De-duplicate posts. dict.fromkeys keeps first-seen order, whereas a
    # set would shuffle the posts differently on every run.
    entries = content.split('\n\n')
    unique_entries = list(dict.fromkeys(entries))
    content = '\n\n'.join(unique_entries)

    with open(filename, 'w', encoding='utf-8') as file:
        # Write the processed content back to the file
        file.write(content)
# --- Configuration -----------------------------------------------------
# Both values are placeholders and must be edited before running.
group_domain = 'group_domain'  # Replace with the actual group ID
access_token = 'access_token'  # Replace with your VKontakte API access token

# The output file is named after the group domain.
filename = f'{group_domain}.txt'

# Ask the user how many posts should be requested.
count = int(input('Enter the number of posts to process: '))

print(' ')
print('Please wait...\n')

# --- Pipeline: resolve ID -> download -> save -> clean -------------------
group_id = get_group_id(group_domain, access_token)
posts = get_posts(group_id, access_token, count)
save_posts_to_file(posts, filename)
process_posts_file(filename)

# --- Summary -------------------------------------------------------------
# total_posts is the number the user asked for, not the number fetched.
# NOTE(review): valid_posts is counted from the downloaded list, before
# process_posts_file rewrites the file, so it may differ from the number of
# posts actually left in the file - confirm which figure is intended.
total_posts = count
valid_posts = sum(1 for post in posts if post.strip())
print('Posts saved to file and processed.\n')
print(f'Total processed: {total_posts}')
print(f'After filtering: {valid_posts}')