-
Notifications
You must be signed in to change notification settings - Fork 1
/
loaddata.py
24 lines (22 loc) · 839 Bytes
/
loaddata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pandas as pd
import os
def clean_data(user_query_list, filepath):
    """Read a CSV file into a cleaned DataFrame tagged with a name.

    The first CSV column is used as the index. Rows containing any missing
    value are removed, then duplicate rows are removed (the first occurrence
    is kept).

    Args:
        user_query_list: Sequence whose first element is stored on the
            result as its ``name`` attribute (presumably the dataset/table
            name taken from the user's query -- confirm against the caller).
        filepath: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The cleaned ``pandas.DataFrame`` with ``.name`` set.
    """
    raw_frame = pd.read_csv(filepath, index_col=0)
    complete_rows = raw_frame.dropna()
    cleaned = complete_rows.drop_duplicates()
    # Plain instance attribute; load_data_to_file_system reads it back as
    # ``df.name`` to build the chunk directory and file names.
    cleaned.name = user_query_list[0]
    return cleaned
def load_data_to_file_system(df, currentdb=None, chunk_size=10000):
    """Split a DataFrame into CSV chunk files under ``./<df.name>_chunks``.

    Chunk files are named ``<df.name>_chunk<N>.csv`` with ``N`` starting at 1.
    Every row is written, including a final partial chunk shorter than
    ``chunk_size``.

    Nothing is written when no database is selected or when the chunk
    directory already exists; a message is printed instead.

    Args:
        df: DataFrame carrying a ``name`` attribute (as set by
            ``clean_data``), used for the directory and file names.
        currentdb: The currently selected database; ``None`` means no
            database has been entered and the call is a no-op.
        chunk_size: Maximum number of rows per chunk file.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    if currentdb is None:
        print("Not in a database, please enter a database with USEDB first")
        return

    dataset_name = df.name
    chunk_dir = os.path.join(".", dataset_name + "_chunks")
    if os.path.exists(chunk_dir):
        # Leave the existing chunks untouched rather than mixing old and
        # new files in the same directory.
        print("Dataset already chunked!")
        return
    os.mkdir(chunk_dir)

    chunk_count = 0
    # Start offsets 0, chunk_size, 2*chunk_size, ...; iloc clamps the final
    # slice to the end of the frame, so the trailing partial chunk is kept.
    for start in range(0, len(df), chunk_size):
        chunk_count += 1
        chunk_path = os.path.join(
            chunk_dir, dataset_name + "_chunk" + str(chunk_count) + ".csv"
        )
        df.iloc[start:start + chunk_size].to_csv(chunk_path)
    print("Created ", str(chunk_count), " files.")