from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """Collects the text content of an HTML document, dropping the tags."""
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ' '.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
# now use strip_tags to remove HTML tags from a string
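# Example usage (a minimal sketch; the sample HTML string is made up):
print(strip_tags('<p>Hello <b>world</b></p>'))  # -> 'Hello  world' (chunks are joined with a space)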
import json

def read_json_from_file(file_path):
    with open(file_path, 'r') as infile:
        return json.load(infile)

def write_json_to_file(file_path, token_map):
    with open(file_path, 'w') as f:
        json.dump(token_map, f)
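# Example round trip (the file name 'tokens.json' and the token counts are illustrative):
write_json_to_file('tokens.json', {'hello': 3, 'world': 1})
assert read_json_from_file('tokens.json') == {'hello': 3, 'world': 1}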
import csv

# Read a CSV file row by row (file_path names the input file).
with open(file_path, 'r', newline='') as infile:
    reader = csv.reader(infile, delimiter=',')
    for row in reader:
        print(row[0], row[1])
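# If the file has a header row, csv.DictReader gives named access to the
# columns instead of positional indexing (a sketch; the 'name' and 'count'
# column names are assumptions about the file's header):
with open(file_path, 'r', newline='') as infile:
    for row in csv.DictReader(infile):
        print(row['name'], row['count'])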
import csv

# Write one number per row; note the file must be opened for writing ('w').
numbers = range(10)
with open(file_path, 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',')
    for number in numbers:
        writer.writerow([number])
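# For many rows at once, writerows takes any iterable of rows
# (a minimal sketch; the squares data is made up):
rows = [(n, n * n) for n in range(10)]
with open(file_path, 'w', newline='') as outfile:
    csv.writer(outfile, delimiter=',').writerows(rows)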
# Sort a dict's items by value, largest first.
dict_as_list = sorted(dict_object.items(), key=lambda x: x[1], reverse=True)
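# Worked example (the token_counts dict is illustrative):
token_counts = {'the': 12, 'a': 7, 'python': 3}
print(sorted(token_counts.items(), key=lambda x: x[1], reverse=True))
# -> [('the', 12), ('a', 7), ('python', 3)]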
import requests

def download_file(url, file_path):
    """Download url to file_path; return True on success, False otherwise."""
    try:
        response = requests.get(url)
    except Exception as e:
        print('Failed to download file from {}: {}'.format(url, e))
        return False
    if response.status_code == requests.codes.ok:
        with open(file_path, 'wb') as outfile:
            outfile.write(response.content)
        return True
    return False
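# Example call (the URL and local file name are placeholders):
if download_file('https://example.com/data.csv', 'data.csv'):
    print('Download succeeded')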
import re

def remove_multiple_space(s):
    """Collapse runs of whitespace into a single space."""
    return re.sub(r'\s+', ' ', s)
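# Example: tabs, newlines, and runs of spaces all collapse to one space.
print(remove_multiple_space('hello   world\t\n!'))  # -> 'hello world !'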
import functools
import multiprocessing.pool

def timeout(max_timeout):
    """Timeout decorator, parameter in seconds."""
    def timeout_decorator(item):
        """Wrap the original function."""
        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            """Closure for function."""
            # Run the wrapped function in a worker thread so the wait can be bounded.
            pool = multiprocessing.pool.ThreadPool(processes=1)
            async_result = pool.apply_async(item, args, kwargs)
            # raises multiprocessing.TimeoutError if execution exceeds max_timeout
            return async_result.get(max_timeout)
        return func_wrapper
    return timeout_decorator
# value is in seconds
@timeout(2)
def my_function():
    print('Function which requires a timeout')
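# Calling a decorated function that overruns its limit raises
# multiprocessing.TimeoutError (a sketch; time.sleep stands in for slow work):
import multiprocessing
import time

@timeout(1)
def slow_function():
    time.sleep(5)

try:
    slow_function()
except multiprocessing.TimeoutError:
    print('slow_function timed out')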