Skip to content

Commit

Permalink
fix lintcode crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
billryan committed Jul 22, 2018
1 parent dc8decd commit 0505b58
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 27 deletions.
55 changes: 36 additions & 19 deletions scripts/lintcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
# -*- coding: utf-8 -*-


from pyquery import PyQuery as pq
# from pyquery import PyQuery as pq
import requests


class Lintcode(object):
Expand All @@ -11,36 +12,52 @@ def __init__(self):
self.driver = None

def open_url(self, url):
    """Fetch the problem detail JSON for a lintcode problem page URL.

    The problem's unique name is taken from the last path component of
    ``url`` (after dropping an optional trailing ``/description`` segment
    and trailing slashes) and looked up through lintcode's problem-detail
    API. The normalized URL is stored in ``self.url`` and the decoded JSON
    response in ``self.driver``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    print('open URL: {}'.format(url))
    url = url.strip().rstrip('/')
    # NOTE: str.strip('description') removes any of those *characters*
    # from both ends (so "permutations" would become "permuta"); drop the
    # literal path suffix instead.
    suffix = '/description'
    if url.endswith(suffix):
        url = url[:-len(suffix)].rstrip('/')
    self.url = url
    lintcode_unique_name = url.split('/')[-1]
    response = requests.get(
        'https://www.lintcode.com/api/problems/detail/',
        params=[
            ('unique_name_or_alias', lintcode_unique_name),
            ('_format', 'detail'),
        ],
        timeout=30,  # never hang forever on a stalled connection
    )
    response.raise_for_status()
    self.driver = response.json()

def get_title(self):
    """Return the problem title from the detail JSON held in ``self.driver``.

    Raises:
        KeyError: if the loaded detail has no ``'title'`` entry.
    """
    print('get title...')
    return self.driver['title']

def get_description(self):
    """Assemble the full problem description from ``self.driver``.

    Starts with the ``'description'`` field and appends each non-empty
    optional section (notice, clarification, example, challenge) under a
    ``#### <Section>`` heading, separated by blank lines. Sections that
    are missing or empty are skipped.
    """
    print('get description...')
    detail = self.driver
    sections = (
        ('Notice', 'notice'),
        ('Clarification', 'clarification'),
        ('Example', 'example'),
        ('Challenge', 'challenge'),
    )
    parts = [detail.get('description') or '']
    for heading, key in sections:
        text = detail.get(key)
        if text:
            parts.append('#### {}\n\n{}'.format(heading, text))
    return '\n\n'.join(parts)

def get_difficulty(self):
    """Translate the numeric ``'level'`` in ``self.driver`` to a label.

    Returns ``'Easy'``, ``'Medium'`` or ``'Hard'`` for levels 1, 2 and 3,
    and ``'unknown'`` for any other (or missing) level.
    """
    print('get difficulty...')
    mapping = {1: 'Easy', 2: 'Medium', 3: 'Hard'}
    return mapping.get(self.driver.get('level'), 'unknown')

def get_tags(self):
    """Return the tag labels listed under ``'tags'`` in ``self.driver``.

    Each tag contributes its ``'alias'`` when that is non-empty and its
    ``'name'`` otherwise.
    """
    print('get tags...')
    return [tag.get('alias') or tag['name'] for tag in self.driver['tags']]

def _get_related(self):
Expand All @@ -67,12 +84,12 @@ def get_problem_all(self, url):
'difficulty': difficulty,
'tags': tags,
'description': description,
'url': self._clean_url(url)
'url': self.url
}
return problem


if __name__ == '__main__':
    # Manual smoke test: fetch one known problem and print the parsed result.
    url = 'https://www.lintcode.com/problem/topological-sorting'
    lintcode = Lintcode()
    print(lintcode.get_problem_all(url))
8 changes: 5 additions & 3 deletions scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,16 @@ def curr_time():
problem_md = ''
problem_slug = ''
xxxcode = None
convert_desc = True
if raw_url.startswith('https://leetcode'):
xxxcode = Leetcode()
elif raw_url.startswith('http://www.lintcode.com'):
elif raw_url.startswith('https://www.lintcode.com'):
xxxcode = Lintcode()
convert_desc = False
problem = xxxcode.get_problem_all(raw_url)
problem_slug = slugify(problem['title'], separator="_")
problem_md = problem2md(problem)
problem_md = problem2md(problem, convert_desc)

if args.dir:
post_dir = os.path.join(ROOTDIR, args.dir)
post_fn = os.path.join(post_dir, problem_slug + '.md')
Expand Down
12 changes: 7 additions & 5 deletions scripts/ojhtml2markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,21 @@ def content(self):
def leet_lint_url(url):
    """Find the leetcode and lintcode pages that share a problem slug.

    The slug is the last path component of ``url``. A HEAD request is sent
    to the corresponding leetcode and lintcode problem URLs; every URL that
    does not answer 404 is returned under its site key (``'leetcode'`` /
    ``'lintcode'``). URLs that answer 404 are reported on stdout and left
    out of the result.
    """
    problem_slug = url.strip('/').split('/')[-1]
    candidates = (
        ('leetcode', 'https://leetcode.com/problems/{}/'.format(problem_slug)),
        ('lintcode', 'https://www.lintcode.com/problem/{}/'.format(problem_slug)),
    )
    urls = {}
    # Use a distinct loop name so the ``url`` argument is not shadowed.
    for site, candidate in candidates:
        response = requests.head(candidate)
        if response.status_code != 404:
            urls[site] = candidate
        else:
            print('cannot find url with: {}'.format(candidate))
    return urls


def problem2md(problem):
def problem2md(problem, convert_desc=True):
metadata = {
'title': problem['title'],
'difficulty': problem['difficulty']
Expand All @@ -49,8 +49,10 @@ def problem2md(problem):
metadata['tags'] = problem['tags']

description = problem['description']
h = html2text.HTML2Text()
description_md = h.handle(description)
description_md = description
if convert_desc:
h = html2text.HTML2Text()
description_md = h.handle(description)

lines = []
lines.append('# ' + problem['title'] + '\n')
Expand Down

0 comments on commit 0505b58

Please sign in to comment.