-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data.py
34 lines (27 loc) · 1008 Bytes
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pandas as pd
# Load the training set into the module-level frame that the cleaning loop
# further down operates on.
df = pd.read_csv(filepath_or_buffer='training_set.csv')
# Preview the first five rows (the default of DataFrame.head) as a sanity check.
print(df.head(n=5))
# Helper for stripping HTML tags, HTML entities and whitespace from text columns
import re
import numpy as np
# Patterns are compiled once at import time and shared by every call.
_WHITESPACE_RE = re.compile(r'\s')
_HTML_TAG_RE = re.compile(r'<.*?>')
# NOTE(review): this pattern also swallows any plain-text run between an '&'
# and the next ';' (e.g. "R&D is fun; really" loses "&D is fun;").  It is kept
# unchanged so that the cleaned output stays the same -- confirm whether a
# stricter entity pattern is wanted.
_HTML_ENTITY_RE = re.compile(r'&.*?;')


def remove_html_contents(input_txt):
    """Blank out whitespace, HTML tags and HTML entities in a sequence of strings.

    For every string element, each whitespace character, each ``<...>`` tag
    and each ``&...;`` entity is replaced by a single space, in that order.
    Whitespace is handled first so that a tag broken across several lines is
    flattened onto one line before the tag pattern runs (its ``.`` does not
    match newlines).

    Args:
        input_txt: A mutable, position-indexable sequence such as a ``list``
            or a ``pandas.Series``.  Elements that are not ``str`` (NaN/None
            for missing values, numbers, ...) are left untouched.

    Returns:
        The same ``input_txt`` object, modified in place.  When ``input_txt``
        is a column extracted from a DataFrame, assign the return value back
        to that column: with pandas Copy-on-Write enabled the parent frame
        does not observe in-place changes made to the extracted Series.
    """
    is_series = isinstance(input_txt, pd.Series)
    for position, text in enumerate(input_txt):
        # Covers missing values (NaN/None) as well as non-text cells, which
        # the regex functions would reject with a TypeError.
        if not isinstance(text, str):
            continue
        text = _WHITESPACE_RE.sub(' ', text)
        text = _HTML_TAG_RE.sub(' ', text)
        text = _HTML_ENTITY_RE.sub(' ', text)
        if is_series:
            # enumerate() yields positions, not index labels; plain
            # ``series[position] = ...`` is label-based and would touch the
            # wrong row (or append one) on a non-default index.
            input_txt.iloc[position] = text
        else:
            input_txt[position] = text
    return input_txt
# Strip HTML tags/entities from every text column, then save the result.
columns_to_clean = ['question_text', 'raw_answer_text', 'cleaned_answer_text']
for column_name in columns_to_clean:
    # Print the first five rows before and after cleaning for a visual check.
    print("\n", column_name, ": \n", df[column_name][:5])
    # Assign the cleaned Series back explicitly.  Relying on in-place mutation
    # of ``df[column_name]`` silently leaves ``df`` unchanged when pandas
    # Copy-on-Write is active, because the extracted Series is then detached
    # from the frame.
    df[column_name] = remove_html_contents(df[column_name].copy())
    print("\n",column_name, ": \n",df[column_name][:5])
# NOTE(review): the row index is written as an extra, unnamed first column;
# pass index=False here if downstream readers do not expect it.
df.to_csv('cleaned.csv')