-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhello-leetcode.py
200 lines (153 loc) · 7.48 KB
/
hello-leetcode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import csv
import re
import os
import pandas as pd
import re
pd.set_option('display.width', 1000) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
# 配置
# executable_path = '/Users/liwei/geckodriver'
executable_path = "C:\\Users\\Administrator\\Desktop\\crawl-leetcode\\geckodriver-v0.19.1-win64\\geckodriver.exe"
leetcode_url_en = 'https://leetcode.com/problemset/all/'
leetcode_url_cn = 'https://leetcode-cn.com/problemset/all/'
question_select_en = "" # question-app > div > div:nth-child(2) > div.question-list-base > div.table-responsive.question-list-table > table > tbody.reactable-pagination > tr > td > span.row-selector > select""
question_select_cn = "#question-app > div > div:nth-child(2) > div.question-list-base > div.table-responsive.question-list-table > table > tbody.reactable-pagination > tr > td > span > select"
select_visible_text_en = "all"
select_visible_text_cn = "全部"
html_name_en = "leetcode-en.html"
html_name_cn = "leetcode-cn.html"
problems_file_name_en = "problems-en.csv"
problems_file_name_cn = "problems-cn.csv"
url_prefix_en = "https://leetcode.com/"
url_prefix_cn = "https://leetcode-cn.com/"
template_file_name = "leetcode-solution-template.md"
dir_name = 'out'
# 第 1 步:下载中文网页
def download_page_of_all_link(executable_path, url, select_visible_text, html_name):
driver = webdriver.Firefox(executable_path=executable_path)
driver.get(url=url) # 让浏览器访问网页
driver.maximize_window()
s1 = Select(driver.find_element_by_css_selector(question_select_cn))
s1.select_by_visible_text(select_visible_text)
content = driver.page_source
with open(html_name, 'w', encoding='utf-8') as fw:
fw.write(content)
driver.close()
class Problem:
num = ""
difficulty = ""
acp = ""
title_cn = ""
title_en = ""
link = ""
# 解析中文网页
def parse_page(problems, html_name, url_prefix):
with open(html_name, 'r', encoding='utf-8') as fr:
html = fr.read();
soup = BeautifulSoup(html, 'lxml')
lines = soup.select(".reactable-data tr")
for line in lines:
p = Problem()
p.num = line.select("td")[1].text # 序号
p.acp = line.select("td")[4].text # 通过率
p.difficulty = line.select("td")[5].text # 题目难度
p.title_cn = line.select("td")[2].text.strip() # 题目名称(中文)
p.title_en = line.select("td")[2].attrs['value'] # 题目名称(英文)
link_suffix = line.select("td")[2].select("div a")[0].attrs['href']
link = urljoin(url_prefix, link_suffix) + '/description/' # 题目名称(英文)
p.link = link
problems.append([p.num, p.title_cn, p.link, p.difficulty, p.acp, p.title_en])
print("一共有 {} 题".format(len(problems)))
for p in problems:
print(p)
def write_to_csv(problems, problems_file_name):
with open(problems_file_name, 'w', encoding='utf-8', newline='') as fw:
writer = csv.writer(fw)
for row in problems:
writer.writerow(row)
def download_csv():
download_page_of_all_link(executable_path, leetcode_url_en, select_visible_text_en, html_name_en)
problems_en = []
parse_page(problems_en, html_name_en, url_prefix_en)
write_to_csv(problems_en, problems_file_name_en)
download_page_of_all_link(executable_path, leetcode_url_cn, select_visible_text_cn, html_name_cn)
problems_cn = []
parse_page(problems_cn, html_name_cn, url_prefix_cn)
write_to_csv(problems_cn, problems_file_name_cn)
print('第 1 步:爬取 LeetCode 中英文网站完成!')
# 根据文件名创建文件(模板内容固定)
def touch_one_solution_file(file_name):
with open(template_file_name, 'r', encoding='utf-8') as fr:
content = fr.read()
file_name = "out" + os.sep + file_name
with open(file_name, 'w', encoding='utf-8') as fw:
fw.write(content)
# 读 LeetCode 问题列表,并且创建文件名
def read_problems_list():
with open("problems-en.csv", 'r', encoding="utf-8") as fr:
csv_reader = csv.reader(fr)
for row in csv_reader:
number = "{:0>4d}".format(int(row[0]))
title = re.search("/problems/(.*?)/description/", row[2]).group(1)
file_name = "leetcode-" + number + "-" + title + ".md"
touch_one_solution_file(file_name)
def mkdir():
if not os.path.exists(dir_name):
os.makedirs(dir_name)
def merge_en_cn_problems():
dataframe_en = pd.read_csv(problems_file_name_en, header=None)
dataframe_en.columns = ["num", "title_en", "link_en", "difficulty", "acp", "title_en2"]
dataframe_cn = pd.read_csv(problems_file_name_cn, header=None)
dataframe_cn.columns = ["num", "title_cn", "link_cn", "difficulty", "acp", "title_en"]
question_descriptions_file_name = "question_descriptions.csv"
dataframe_question_descriptions = pd.read_csv(question_descriptions_file_name, header=None)
dataframe_question_descriptions.columns = ["num", "question_descriptions"]
result = pd.merge(dataframe_en, dataframe_cn, on='num')
result = pd.merge(result, dataframe_question_descriptions, on='num')
result.to_csv("leet-code-en-cn-merge.csv", index=None)
print("第 2 步:中英文 csv 文件合并完成!")
template_content = ""
def read_from_template():
global template_content
with open('leetcode-solution-template.md', 'r', encoding='utf-8') as fr:
template_content = fr.read()
def get_difficulty_html(difficulty):
if difficulty == 'Easy':
return '<span class="label label-warning">简单</span>'
if difficulty == 'Medium':
return '<span class="label label-warning">中等</span>'
if difficulty == 'Hard':
return '<span class="label label-warning">困难</span>'
def transform_template(row):
template = template_content
template = re.sub('\$title\$', str(row['num']) + '. ' + row['title_en_x'], template)
template = re.sub('\$difficulty\$', get_difficulty_html(row['difficulty_x']), template)
template = re.sub('\$link-en\$', row['link_en'], template)
template = re.sub('\$title-en\$', str(row['num']) + '. ' + row['title_en_x'], template)
template = re.sub('\$link-cn\$', row['link_cn'], template)
template = re.sub('\$title-cn\$', str(row['num']) + '. ' + row['title_cn'], template)
template = re.sub('\$question_descriptions\$', row['question_descriptions'], template)
return template
def read_template_and_replace():
read_from_template()
dataframe = pd.read_csv('leet-code-en-cn-merge.csv', index_col=None)
for index, row in dataframe.iterrows():
content = transform_template(row)
number = "{:0>4d}".format((row['num']))
title = re.search("/problems/(.*?)/description/", row['link_en']).group(1)
# 文件全名,创建它
file_name = "leetcode-" + number + "-" + title + ".md"
file_name = "out" + os.sep + file_name
with open(file_name, 'w', encoding='utf-8') as fw:
fw.write(content)
print("第 3 步:读取模板并且替换完成!")
if __name__ == '__main__':
mkdir()
# download_csv()
merge_en_cn_problems()
read_template_and_replace()