Skip to content

Commit 0484393

Browse files
committed
update automation addition
1 parent 5ea8749 commit 0484393

11 files changed

+913
-705
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,6 @@ View and explore the [RDF mappings](https://github.com/dtai-kg/MLSea-Discover/tr
8585

8686
Generate the RDF dumps of MLSea-KG by running:
8787

88-
python data_integration.py
88+
python data_integration_openml.py
89+
python data_integration_kaggle.py
90+
python data_integration_pwc.py

resource_code/config.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,27 @@
55
PWC_INPUT = "/Users/ioannisdasoulas/Desktop/ML-Discovery/ML-KG/Data/PwC-Data/"
66
OUTPUT_PATH = "/Users/ioannisdasoulas/Desktop/ML-Discovery/ML-KG/RDF_Dumps/"
77
ORIGINAL_DATA_FOLDER = "Original-Data/"
8-
#UPDATE_MONTH_FOLDER = "December2023/"
8+
UPDATE_MONTH_FOLDER = "10-01-2024/"
99

1010
# OpenML API Checkpoints
11-
OPENML_RUN_CHECKPOINT = 4037070
11+
OPENML_RUN_CHECKPOINT = 4037082
1212
OPENML_RUN_CURRENT_OFFSET = 6000000
13-
OPENML_DATASET_CHECKPOINT = 5399
14-
OPENML_FLOW_CHECKPOINT = 47250
15-
OPENML_TASK_CHECKPOINT = 16736
13+
OPENML_DATASET_CHECKPOINT = 5402
14+
OPENML_FLOW_CHECKPOINT = 16751
15+
OPENML_TASK_CHECKPOINT = 47250
16+
17+
# Current part number of each dump file
18+
OPENML_TASK_DUMP_PART = 1
19+
OPENML_FLOW_DUMP_PART = 1
20+
OPENML_DATASET_DUMP_PART = 1
21+
OPENML_RUN_DUMP_PART = 29
22+
KAGGLE_DUMP_PART = 1
23+
PWC_DUMP_PART = 1
24+
25+
# Triples limit per dump
26+
OPENML_DUMP_LIMIT = 50000000
27+
KAGGLE_DUMP_LIMIT = 30000000
28+
PWC_DUMP_LIMIT = 20000000
1629

1730
def update_openml_checkpoints(run_cp, dataset_cp, task_cp, flow_cp):
1831

@@ -21,10 +34,10 @@ def update_openml_checkpoints(run_cp, dataset_cp, task_cp, flow_cp):
2134
content = file.read()
2235

2336
# Update the values in memory
24-
content = content.replace('OPENML_RUN_CHECKPOINT = 4037070', 'OPENML_RUN_CHECKPOINT = ' + str(run_cp))
25-
content = content.replace('OPENML_DATASET_CHECKPOINT = 5399', 'OPENML_DATASET_CHECKPOINT = ' + str(dataset_cp))
26-
content = content.replace('OPENML_FLOW_CHECKPOINT = 47250', 'OPENML_FLOW_CHECKPOINT = ' + str(task_cp))
27-
content = content.replace('OPENML_TASK_CHECKPOINT = 16736', 'OPENML_TASK_CHECKPOINT = ' + str(flow_cp))
37+
content = content.replace('OPENML_RUN_CHECKPOINT = 4037082', 'OPENML_RUN_CHECKPOINT = ' + str(run_cp))
38+
content = content.replace('OPENML_DATASET_CHECKPOINT = 5402', 'OPENML_DATASET_CHECKPOINT = ' + str(dataset_cp))
39+
content = content.replace('OPENML_FLOW_CHECKPOINT = 16751', 'OPENML_FLOW_CHECKPOINT = ' + str(flow_cp))
40+
content = content.replace('OPENML_TASK_CHECKPOINT = 47250', 'OPENML_TASK_CHECKPOINT = ' + str(task_cp))
2841

2942
# Write the changes back to the config.py file
3043
with open('config.py', 'w') as file:

0 commit comments

Comments
 (0)