-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
80 lines (65 loc) · 3.48 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from src.harvest import harvest_oai, collections
from src.convert import oai_to_dataframe
import src.curate as curate
from datetime import timedelta
import time
import sys
def _harvest_and_convert(key):
    """Harvest one collection to raw XML, convert it, and save it as parquet.

    Writes ``data/raw/{key}.xml`` and ``data/converted/{key}.parquet``.

    Args:
        key: A key of ``collections`` identifying the collection to harvest.

    Returns:
        The object returned by ``oai_to_dataframe`` for the harvested file.
    """
    raw_path = f"data/raw/{key}.xml"
    # harvest and save the raw XML file
    print(f"\nHarvesting {collections[key]['title']}")
    harvest_oai(key=key, savepath=raw_path)
    # take the raw XML file, convert it to a dataframe and save it
    print(f"\nConverting {key} to dataframe")
    df = oai_to_dataframe(raw_path, rename_columns=False)
    df.to_parquet(f"data/converted/{key}.parquet")
    return df


def _build_enb_books():
    """Harvest both ENB book sub-collections and merge them into one dataframe.

    Each sub-collection is harvested and converted separately; the two
    converted parquet files are then read back, concatenated with a fresh
    index, and saved as ``data/converted/enb_books.parquet``.

    Returns:
        The concatenated dataframe.
    """
    import pandas as pd  # only this code path needs pandas directly

    parts = ["enb_estonian_books", "enb_non_estonian_books"]
    for part in parts:
        # The return value is dropped on purpose so that only one converted
        # sub-collection is held in memory at a time during harvesting.
        _harvest_and_convert(part)

    # concatenate the dataframes for cleaning
    frames = [pd.read_parquet(f"data/converted/{part}.parquet") for part in parts]
    df = pd.concat(frames).reset_index(drop=True)
    del frames  # free up RAM
    df.to_parquet("data/converted/enb_books.parquet")
    return df


def _curate_and_save(df, key, collection_type, curate_fn):
    """Clean ``df`` with ``curate_fn``, organize its columns, and save it.

    Writes the result to ``data/curated/{key}.parquet``.
    """
    df = curate_fn(df)
    df = curate.organize_columns(df, collection_type=collection_type)
    df.to_parquet(f"data/curated/{key}.parquet")
    # Optional TSV export, kept from the original script for reference:
    # df.to_csv(f"data/curated/{key}.tsv", sep="\t", encoding="utf8", index=False)
    return df


def main(argv=None):
    """Run the harvest -> convert -> curate pipeline for one collection.

    Args:
        argv: Command-line argument list; defaults to ``sys.argv``. The
            collection key is expected at index 1.

    Raises:
        SystemExit: If no collection key was supplied (prints a usage line).
        ValueError: If the collection key is not ``'enb_books'`` or a key of
            ``collections``.
    """
    argv = sys.argv if argv is None else argv
    # Fail with a readable usage message instead of a bare IndexError.
    if len(argv) < 2:
        script = argv[0] if argv else "main.py"
        sys.exit(f"Usage: python {script} <collection>")

    start_time = time.time()
    key = argv[1]

    valid_keys = ['enb_books'] + list(collections.keys())
    if key not in valid_keys:
        raise ValueError(f"Invalid collection: {key}. Valid collections are: {valid_keys}")

    if key == "enb_books":
        df = _build_enb_books()
        # clean and filter the converted dataframe
        print("\nCleaning dataframe")
        _curate_and_save(df, key, "books", curate.curate_books)
    elif key == "persons":
        df = _harvest_and_convert(key)
        # clean and filter the converted dataframe
        print("\nCleaning dataframe")
        _curate_and_save(df, key, "persons", curate.curate_persons)
    else:
        df = _harvest_and_convert(key)
        # No dedicated cleaning exists for these collections, so they are
        # processed with the book routines and the user is warned about it.
        print("\nProcessing dataframe")
        print("Warning: some of the columns in this collection do not yet have custom cleaning functions. Cleaning will proceed as if the collection were 'enb_books', but the result may be partially incorrect. Please check 'curate.py' for reference.")
        _curate_and_save(df, key, "books", curate.curate_books)

    end_time = time.time()
    elapsed_time = end_time - start_time  # Calculate the elapsed time
    formatted_time = str(timedelta(seconds=elapsed_time))  # Format the elapsed time
    print(f"\nCompleted in {formatted_time}")


if __name__ == "__main__":
    main()