From 3e1a9c85a9e024f8bbe2922c8d1a572ae2a0234f Mon Sep 17 00:00:00 2001
From: parisa-zahedi
Date: Fri, 10 Jan 2025 13:30:17 +0100
Subject: [PATCH 1/2] merge multiple scripts into one
---
README.md | 141 ++++++++----------
config.json | 33 ++--
pyproject.toml | 11 +-
..._generate_output.py => generate_output.py} | 0
scripts/step1_filter_articles.py | 101 -------------
scripts/step2_categorize_by_timestamp.py | 117 ---------------
scripts/step3_select_final_articles.py | 119 ---------------
7 files changed, 95 insertions(+), 427 deletions(-)
rename scripts/{step4_generate_output.py => generate_output.py} (100%)
delete mode 100644 scripts/step1_filter_articles.py
delete mode 100644 scripts/step2_categorize_by_timestamp.py
delete mode 100644 scripts/step3_select_final_articles.py
diff --git a/README.md b/README.md
index a32351b..495938c 100644
--- a/README.md
+++ b/README.md
@@ -20,30 +20,11 @@ To install and run this project you need to have the following prerequisites ins
```
### Installation
-#### Option 1 - Install dataQuest package
To run the project, install the dataQuest package that is part of this project.
```
pip install dataQuest
```
-#### Option 2 - Run from source code
-If you want to run the scripts without installation you need to:
-- Install requirement
-```commandline
-pip install setuptools wheel
-python -m pip install build
-```
-Change your current working directory to the location of your pyproject.toml file.
-```
-python -m build
-pip install .
-```
-- Set PYTHONPATH environment:
-On Linux and Mac OS, you might have to set the PYTHONPATH environment variable to point to this directory.
-
-```commandline
-export PYTHONPATH="current working directory/dataQuest:${PYTHONPATH}"
-```
### Built with
These packages are automatically installed in the step above:
* [scikit-learn](https://scikit-learn.org/stable/)
@@ -93,92 +74,96 @@ extractor.extract_xml_string()
Navigate to the scripts folder and run:
```
-python3 convert_input_files.py --input_dir path/to/raw/xml/data --output_dir path/to/converted/json/compressed/output
+python3 convert_input_files.py \
+    --input_dir path/to/raw/xml/data \
+    --output_dir path/to/converted/json/compressed/output
```
#### Customize input-file
-In order to define a corpus with a new data format you should:
+To add a new corpus to dataQuest you should:
+- prepare your input data in the JSON format explained above.
- add a new input_file_type to [INPUT_FILE_TYPES](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/__init__.py)
- implement a class that inherits from [input_file.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/input_file.py).
This class is customized to read a new data format. In our case study we defined [delpher_kranten.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/delpher_kranten.py); a minimal sketch of such a class follows.
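For illustration, here is what such a class could look like. The method name and the article fields are assumptions for this example; mirror the abstract interface actually defined in [input_file.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/input_file.py):
```
from dataQuest.filter.input_file import InputFile


class MyCorpusFile(InputFile):
    """Hypothetical reader for a new corpus format."""

    def articles(self):
        # Parse one input file of the new format and yield its
        # articles (id, title, date, body) one at a time, so the
        # configured filters can be applied to each of them.
        ...


# Register the class under the name passed via --input-type,
# e.g. in dataQuest/filter/__init__.py (illustrative):
# INPUT_FILE_TYPES["my_corpus"] = MyCorpusFile
```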
-### 2. Filtering
-In this step, you may select articles based on a filter or a collection of filters. Articles can be filtered by title, year, decade, or a set of keywords defined in the ```config.json``` file.
+### 2. Filter articles
+You can select articles based on a single filter or a combination of filters. Articles can be filtered by title, year,
+decade, or a set of keywords defined in the ```config.json``` file. Logical operators such as AND, OR, and NOT can be used to combine filtering expressions.
+
+In the following example, you select articles that include any of the specified keywords AND were published between 1800 and 1910 AND do not
+contain advertisements (e.g., "Advertentie").
```commandline
"filters": [
{
"type": "AndFilter",
- "filters": [
- {
- "type": "OrFilter",
- "filters": [
+ "filters": [
{
"type": "YearFilter",
"start_year": 1800,
"end_year": 1910
},
{
- "type": "DecadeFilter",
- "decade": 1960
+ "type": "NotFilter",
+ "filter": {
+ "type": "ArticleTitleFilter",
+ "article_title": "Advertentie"
+ },
+ "level": "article"
+ },
+ {
+ "type": "KeywordsFilter",
+ "keywords": ["sustainability", "green"]
}
- ]
- },
- {
- "type": "NotFilter",
- "filter": {
- "type": "ArticleTitleFilter",
- "article_title": "Advertentie"
- },
- "level": "article"
- },
- {
- "type": "KeywordsFilter",
- "keywords": ["sustainability", "green"]
- }
- ]
+ ]
}
- ]
+ ],
```
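Conceptually, ```load_filters_from_config``` (imported from ```dataQuest.utils``` in the scripts) turns this JSON into a tree of filter objects that is evaluated per article. The sketch below illustrates how such a composite evaluates; the class and method names are simplified stand-ins, not the package's actual API:
```
class YearFilter:
    def __init__(self, start_year, end_year):
        self.start_year, self.end_year = start_year, end_year

    def matches(self, article):
        return self.start_year <= article.year <= self.end_year


class AndFilter:
    def __init__(self, filters):
        self.filters = filters

    def matches(self, article):
        # Accept only articles that pass every sub-filter.
        return all(f.matches(article) for f in self.filters)


class NotFilter:
    def __init__(self, inner):
        self.inner = inner

    def matches(self, article):
        # Invert the wrapped filter's decision.
        return not self.inner.matches(article)
```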
-run the following to filter the articles:
-```commandline
-python3 scripts/step1_filter_articles.py --input-dir "path/to/converted/json/compressed/output/" --output-dir "output_filter/" --input-type "delpher_kranten" --glob "*.gz"
-```
-In our case, input-type is "delpher_kranten", and input data is a set of compresed json files with ```.gz``` extension.
+To select the most relevant articles:
+1. Articles are selected based on the filters in the config file.
-The output of this script is a JSON file for each selected article in the following format:
-```commandline
-{
- "file_path": "output/transfered_data/00/KRANTEN_KBPERS01_000002100.json.gz",
- "article_id": "5",
- "Date": "1878-04-29",
- "Title": "Opregte Haarlemsche Courant"
-}
-```
-### 3. Categorization by timestamp
-The output files generated in the previous step are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/temporal_categorization/__init__.py),
-such as ```year``` or ```decade```. This categorization is essential for subsequent steps, especially if you intend to apply tf-idf or other models to specific periods. In our case, we applied tf-idf per decade.
+
+2. Selected articles are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/temporal_categorization/__init__.py),
+such as ```year``` or ```decade```. This categorization is essential for subsequent steps, especially when applying TF-IDF or other models to specific periods.
+
+
+3. The most relevant articles related to the specified topic (defined by the provided keywords) are selected:
+   3.1. Articles that contain any of the specified keywords in their title are selected first.
+
+   3.2. TF-IDF (the default model, which can be extended to other models) ranks and selects the remaining articles.
```commandline
-python3 scripts/step2_categorize_by_timestamp.py --input-dir "output_filter/" --glob "*.json" --period-type "decade" --output-dir "output_timestamped/"
+python3 scripts/filter_articles.py \
+    --input-dir "path/to/converted/json/compressed/" \
+    --output-dir "output/" \
+    --input-type "delpher_kranten" \
+    --glob "*.gz" \
+    --period-type "decade"
```
-The output consists of a .csv file for each period, such as one file per decade, containing the ```file_path``` and ```article_id``` of selected articles.
+In our case:
+- The input data consists of compressed JSON files with the .gz extension.
+- The input type is "delpher_kranten".
+- Selected articles are categorized by decade.
-### 4. Select final articles
-This step is applicable when articles are filtered (in step 2) using a set of keywords.
-By utilizing tf-idf, the most relevant articles related to the specified topic (defined by the provided keywords) are selected.
-Before applying tf-idf, articles containing any of the specified keywords in their title are selected.
+#### Output
+The output consists of a .csv file for each period, such as one file per decade. Each file contains the ```file_path``` and ```article_id``` of the filtered articles,
+along with an additional column, ```selected```, which indicates the articles labeled as the most relevant by the model (e.g., TF-IDF).
-From the rest of articles, to choose the most relevant ones, you can specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json):
+There are different strategies for selecting the final articles. You should specify one of the following criteria in [config.json](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json):
-- Percentage of selected articles with the top scores
-- Maximum number of selected articles with the top scores
-- Threshold for the value of cosine similarity between the embeddings of list of keywords and each article.
+- Percentage: Select a percentage of articles with the highest scores.
+- Maximum number: Specify the maximum number of articles to select based on their scores.
+- Threshold: Set a threshold on the cosine similarity between the embeddings of the keyword list and each article.
```commandline
"article_selector":
@@ -204,12 +189,8 @@ From the rest of articles, to choose the most relevant ones, you can specify one
},
```
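As an illustration of how these strategies could be applied, the sketch below ranks articles against the keyword list using TF-IDF and cosine similarity, then applies each criterion. The selector field names are assumptions for this example; the package's actual logic lives in ```dataQuest.article_final_selection```:
```
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def rank_articles(texts, keywords):
    """Score each article text against the keyword list."""
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts + [" ".join(keywords)])
    # Cosine similarity between each article and the keyword "document".
    return cosine_similarity(matrix[:-1], matrix[-1:]).ravel()


def select_indices(scores, selector):
    """Apply one of the three selection strategies to the scores."""
    order = np.argsort(scores)[::-1]  # highest scores first
    if selector["type"] == "percentage":
        top = max(1, int(len(scores) * selector["percentage"] / 100))
        return order[:top].tolist()
    if selector["type"] == "maximum_number":
        return order[: selector["maximum_number"]].tolist()
    # "threshold": keep everything at least as similar as the cut-off.
    return [i for i in order if scores[i] >= selector["threshold"]]
```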
-The following script, add a new column, ```selected``` to the .csv files from the previous step.
-```commandline
-python3 scripts/step3_select_final_articles.py --input-dir "output/output_timestamped/"
-```
-### 5. Generate output
+### 3. Generate output
As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user can choose whether the text is divided into paragraphs or into fixed-size segments.
This feature can be set in [config.json](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json).
```commandline
@@ -225,7 +206,10 @@ OR
```
```commandline
-python3 scripts/step4_generate_output.py --input-dir "output/output_timestamped/” --output-dir “output/output_results/“ --glob “*.csv”
+python3 scripts/generate_output.py \
+    --input-dir "output/output_timestamped/" \
+    --output-dir "output/output_results/" \
+    --glob "*.csv"
```
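As a rough illustration of the two options, paragraph mode keeps natural text units, while segmentation cuts the text into chunks of similar size. A minimal sketch, assuming blank-line paragraph boundaries and fixed-size character segments (the package's actual splitting logic may differ):
```
def split_into_paragraphs(text):
    # Keep natural paragraph boundaries (blank lines).
    return [p.strip() for p in text.split("\n\n") if p.strip()]


def split_into_segments(text, size=500):
    # Cut the text into fixed-size character chunks.
    return [text[i:i + size] for i in range(0, len(text), size)]
```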
## About the Project
**Date**: February 2024
@@ -239,7 +223,6 @@ Pim Huijnen (p.huijnen@uu.nl)
- Parisa Zahedi (p.zahedi@uu.nl)
- Shiva Nadi (s.nadi@uu.nl)
-- Matty Vermet (m.s.vermet@uu.nl)
### License
diff --git a/config.json b/config.json
index 84f6a9a..7fb7e97 100644
--- a/config.json
+++ b/config.json
@@ -1,13 +1,28 @@
{
- "filters": [
- {
- "type": "KeywordsFilter",
- "keywords": ["windkracht", "windenergie", "windenergiebranche", "windturbine", "windstroom",
- "zonne-energie", "zonnewarmte", "zonnestraling", "geothermische energie", "aardwarmte",
- "waterkracht", "waterkrachtcentrale", "waterkrachtwerken", "waterstof", "waterstofenergie",
- "hydroturbine", "getijden-energie", "ethanol-produktie", "ethanol"]
- }
- ],
+ "filters": [
+ {
+ "type": "AndFilter",
+ "filters": [
+ {
+ "type": "YearFilter",
+ "start_year": 1800,
+ "end_year": 1910
+ },
+ {
+ "type": "NotFilter",
+ "filter": {
+ "type": "ArticleTitleFilter",
+ "article_title": "Advertentie"
+ },
+ "level": "article"
+ },
+ {
+ "type": "KeywordsFilter",
+ "keywords": ["sustainability", "green"]
+ }
+ ]
+ }
+ ],
"article_selector":
{
"type": "percentage",
diff --git a/pyproject.toml b/pyproject.toml
index 7ef3a03..2420ec4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,9 +6,8 @@ build-backend = "setuptools.build_meta"
name = "dataQuest"
description = "A package to extract hystorical news sentiments"
authors = [
- {name = "Shiva Nadi", email = "s.nadi@uu.nl"},
{name = "Parisa Zahedi", email = "p.zahedi@uu.nl"},
- {name = "Matty Vermet", email = "m.s.vermet@uu.nl"}
+ {name = "Shiva Nadi", email = "s.nadi@uu.nl"}
]
readme = "README.md"
requires-python = ">=3.8"
@@ -52,3 +51,11 @@ module = [
"sklearn.*"
]
ignore_missing_imports = true
+
+[project.scripts]
+filter-articles = "dataQuest.filter_articles:cli"
+generate-output = "dataQuest.generate_output:cli"
+
+[tool.setuptools.package-data]
+"dataQuest" = ["config.json"]
+
diff --git a/scripts/step4_generate_output.py b/scripts/generate_output.py
similarity index 100%
rename from scripts/step4_generate_output.py
rename to scripts/generate_output.py
diff --git a/scripts/step1_filter_articles.py b/scripts/step1_filter_articles.py
deleted file mode 100644
index 4e67b7a..0000000
--- a/scripts/step1_filter_articles.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-This script filter articles from input files according to
-specified configurations.
-"""
-
-import argparse
-from pathlib import Path
-from typing import Iterable
-from tqdm import tqdm
-from dataQuest.filter import INPUT_FILE_TYPES
-from dataQuest.filter.input_file import InputFile
-from dataQuest.utils import load_filters_from_config
-from dataQuest.utils import save_filtered_articles
-
-
-def filter_articles(
- input_dir: Path,
- glob_pattern: str,
- config_path: Path,
- input_type: str,
- output_dir: Path,
-):
- """
- Core functionality to process files, filter articles, and save results.
-
- Args:
- input_dir (Path): Directory containing input files.
- glob_pattern (str): Glob pattern to match input files.
- config_path (Path): Path to the configuration file.
- input_type (str): File format of the input files.
- output_dir (Path): Directory to save filtered articles.
- """
- if not input_dir.is_dir():
- raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
-
- input_file_class = INPUT_FILE_TYPES[input_type]
- input_files: Iterable[InputFile] = [
- input_file_class(path) for path in input_dir.rglob(glob_pattern)
- ]
-
- output_dir.mkdir(parents=True, exist_ok=True)
-
- compound_filter = load_filters_from_config(config_path)
-
- for input_file in tqdm(input_files, desc="Filtering articles", unit="file"):
- for article in input_file.selected_articles(compound_filter):
- save_filtered_articles(input_file, article.id, output_dir)
-
-
-def cli():
- """
- Command-line interface for filter articles.
- """
- parser = argparse.ArgumentParser("Filter articles from input files.")
-
- parser.add_argument(
- "--input-dir",
- type=Path,
- help="Base directory for reading input files. ",
- )
- parser.add_argument(
- "--glob",
- type=str,
- required=True,
- help="Glob pattern for find input files; e.g. '*.gz' ",
- )
- parser.add_argument(
- "--config-path",
- type=Path,
- default="config.json",
- help="File path of config file.",
- )
- parser.add_argument(
- "--input-type",
- type=str,
- required=True,
- choices=list(INPUT_FILE_TYPES.keys()),
- help="Input file format.",
- )
- parser.add_argument(
- "--output-dir",
- type=Path,
- help="The directory for storing output files.",
- )
-
- args = parser.parse_args()
-
- try:
- filter_articles(
- input_dir=args.input_dir,
- glob_pattern=args.glob,
- config_path=args.config_path,
- input_type=args.input_type,
- output_dir=args.output_dir,
- )
- except ValueError as e:
- parser.error(str(e))
-
-
-if __name__ == "__main__":
- cli()
diff --git a/scripts/step2_categorize_by_timestamp.py b/scripts/step2_categorize_by_timestamp.py
deleted file mode 100644
index a77eac8..0000000
--- a/scripts/step2_categorize_by_timestamp.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""
-This script defines functions and classes to categorize files based
-on their timestamps.
-"""
-import argparse
-import logging
-from typing import Iterable
-from pathlib import Path
-import pandas as pd
-from tqdm import tqdm # type: ignore
-from dataQuest.temporal_categorization import PERIOD_TYPES
-from dataQuest.temporal_categorization.timestamped_data import TimestampedData
-
-OUTPUT_FILE_NAME = 'articles'
-FILENAME_COLUMN = 'file_path'
-ARTICLE_ID_COLUMN = 'article_id'
-
-
-def categorize_articles(
- input_dir: Path,
- period_type: str,
- glob_pattern: str,
- output_dir: Path,
-):
- """
- Core functionality to categorize articles by timestamp.
-
- Args:
- input_dir (Path): Directory containing input files.
- period_type (str): Type of time period to use for categorization.
- glob_pattern (str): Glob pattern to find input files (e.g., '*.json').
- output_dir (Path): Directory to save categorized files.
- """
- if not input_dir.is_dir():
- raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
-
- time_period_class = PERIOD_TYPES[period_type]
- timestamped_objects: Iterable[TimestampedData] = [
- time_period_class(path) for path in input_dir.rglob(glob_pattern)
- ]
-
- output_dir.mkdir(parents=True, exist_ok=True)
-
- for timestamped_object in tqdm(timestamped_objects,
- desc="Categorize by timestamp",
- unit="file"):
- try:
- timestamp = timestamped_object.categorize()
- timestamp_file_name = output_dir / f"{OUTPUT_FILE_NAME}_{timestamp}.csv"
-
- if timestamp_file_name.exists():
- df = pd.read_csv(timestamp_file_name)
- else:
- df = pd.DataFrame(columns=[FILENAME_COLUMN, ARTICLE_ID_COLUMN])
-
- new_row = {
- FILENAME_COLUMN: str(timestamped_object.data()[FILENAME_COLUMN]),
- ARTICLE_ID_COLUMN: str(timestamped_object.data()[ARTICLE_ID_COLUMN]),
- }
- df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
-
- df.to_csv(timestamp_file_name, index=False)
-
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error processing timestamped object: %s", str(e))
-
-
-def cli():
- """
- Command-line interface for categorize articles by timestamp.
- """
- parser = argparse.ArgumentParser("Categorize articles by timestamp.")
-
- parser.add_argument(
- "--input-dir",
- type=Path,
- required=True,
- help="Base directory for reading input files.",
- )
- parser.add_argument(
- "--period-type",
- type=str,
- required=True,
- choices=list(PERIOD_TYPES.keys()),
- help="Time periods",
- )
- parser.add_argument(
- "--glob",
- type=str,
- required=True,
- default="*.json",
- help="Glob pattern for find input files; e.g. '*.json'.",
- )
- parser.add_argument(
- "--output-dir",
- type=Path,
- required=True,
- help="The directory for storing output files.",
- )
-
- args = parser.parse_args()
-
- try:
- categorize_articles(
- input_dir=args.input_dir,
- period_type=args.period_type,
- glob_pattern=args.glob,
- output_dir=args.output_dir,
- )
- except ValueError as e:
- parser.error(str(e))
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error occurred in CLI: %s", str(e))
-
-
-if __name__ == "__main__":
- cli()
diff --git a/scripts/step3_select_final_articles.py b/scripts/step3_select_final_articles.py
deleted file mode 100644
index 969dc22..0000000
--- a/scripts/step3_select_final_articles.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""Select final articles."""
-import argparse
-import logging
-from typing import List
-from pathlib import Path
-import pandas as pd
-from tqdm import tqdm # type: ignore
-from dataQuest.utils import get_keywords_from_config
-from dataQuest.utils import read_config
-from dataQuest.article_final_selection.process_articles import select_articles
-
-ARTICLE_SELECTOR_FIELD = "article_selector"
-
-
-def update_selected_indices_in_file(filepath: str,
- indices_selected: List[int]) -> None:
- """
- Update selected indices in a CSV file.
-
- Args:
- filepath (str): The path to the CSV file.
- indices_selected (List[int]): A list of indices to be marked
- as selected.
-
- Raises:
- ValueError: If indices_selected is empty or contains
- non-negative integers.
-
- """
- try:
- if indices_selected and all(isinstance(idx, int) and idx >= 0
- for idx in indices_selected):
- df = pd.read_csv(filepath)
- df['selected'] = 0
- df.loc[indices_selected, 'selected'] = 1
- df.to_csv(filepath, index=False)
- else:
- raise ValueError("Invalid indices_selected")
- except Exception as e: # pylint: disable=W0718
- logging.error("Error updating selected indices in file: %s",
- e)
-
-
-def select_final_articles(
- input_dir: Path,
- glob_pattern: str,
- config_path: Path,
-):
- """
- Core functionality to select final articles based on keywords and configuration.
-
- Args:
- input_dir (Path): Directory containing input files.
- glob_pattern (str): Glob pattern to match input files (e.g., '*.csv').
- config_path (Path): Path to the configuration file.
- """
- if not input_dir.is_dir():
- raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
-
- keywords = get_keywords_from_config(config_path)
- config_article_selector = read_config(config_path, ARTICLE_SELECTOR_FIELD)
-
- if len(keywords) > 0 and config_article_selector:
- for articles_filepath in tqdm(
- input_dir.rglob(glob_pattern),
- desc="Processing articles",
- unit="file",
- ):
- try:
- selected_indices = select_articles(
- str(articles_filepath), keywords, config_article_selector
- )
-
- update_selected_indices_in_file(str(articles_filepath), selected_indices)
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error processing file %s: %s", articles_filepath, str(e))
-
-
-def cli():
- """
- Command-line interface for selecting final articles.
- """
- parser = argparse.ArgumentParser("Select final articles.")
-
- parser.add_argument(
- "--input-dir",
- type=Path,
- required=True,
- help="Base directory for reading input files.",
- )
- parser.add_argument(
- "--glob",
- type=str,
- default="*.csv",
- help="Glob pattern for find input files; e.g. '*.csv'.",
- )
- parser.add_argument(
- "--config-path",
- type=Path,
- default="config.json",
- help="File path of config file.",
- )
-
- args = parser.parse_args()
-
- try:
- select_final_articles(
- input_dir=args.input_dir,
- glob_pattern=args.glob,
- config_path=args.config_path,
- )
- except ValueError as e:
- parser.error(str(e))
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error occurred in CLI: %s", str(e))
-
-
-if __name__ == "__main__":
- cli()
From 66d909054c0a050b8b521281be22d7c62a2284b8 Mon Sep 17 00:00:00 2001
From: parisa-zahedi
Date: Fri, 10 Jan 2025 13:31:44 +0100
Subject: [PATCH 2/2] merge three scripts into one
---
scripts/filter_articles.py | 248 +++++++++++++++++++++++++++++++++++++
1 file changed, 248 insertions(+)
create mode 100644 scripts/filter_articles.py
diff --git a/scripts/filter_articles.py b/scripts/filter_articles.py
new file mode 100644
index 0000000..4c8c13f
--- /dev/null
+++ b/scripts/filter_articles.py
@@ -0,0 +1,248 @@
+"""
+This script filters articles from input files according to
+specified configurations, categorizes them by time period,
+and selects the most relevant articles per period.
+"""
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Iterable, List
+import pandas as pd
+from tqdm import tqdm
+
+from dataQuest.filter import INPUT_FILE_TYPES
+from dataQuest.filter.input_file import InputFile
+from dataQuest.utils import load_filters_from_config
+from dataQuest.utils import save_filtered_articles
+from dataQuest.temporal_categorization import PERIOD_TYPES
+from dataQuest.temporal_categorization.timestamped_data import TimestampedData
+from dataQuest.utils import get_keywords_from_config
+from dataQuest.utils import read_config
+from dataQuest.article_final_selection.process_articles import select_articles
+
+ARTICLE_SELECTOR_FIELD = "article_selector"
+OUTPUT_FILE_NAME = 'articles'
+FILENAME_COLUMN = 'file_path'
+ARTICLE_ID_COLUMN = 'article_id'
+
+
+def filter_articles(
+ input_dir: Path,
+ glob_pattern: str,
+ config_path: Path,
+ input_type: str,
+ output_dir: Path,
+):
+ """
+ Core functionality to process files, filter articles, and save results.
+
+ Args:
+ input_dir (Path): Directory containing input files.
+ glob_pattern (str): Glob pattern to match input files.
+ config_path (Path): Path to the configuration file.
+ input_type (str): File format of the input files.
+ output_dir (Path): Directory to save filtered articles.
+ """
+ if not input_dir.is_dir():
+ raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
+
+ input_file_class = INPUT_FILE_TYPES[input_type]
+ input_files: Iterable[InputFile] = [
+ input_file_class(path) for path in input_dir.rglob(glob_pattern)
+ ]
+
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ compound_filter = load_filters_from_config(config_path)
+
+ for input_file in tqdm(input_files, desc="Filtering articles", unit="file"):
+ for article in input_file.selected_articles(compound_filter):
+ save_filtered_articles(input_file, article.id, output_dir)
+
+
+def categorize_articles(
+ input_dir: Path,
+ period_type: str,
+ glob_pattern: str,
+ output_dir: Path,
+):
+ """
+ Core functionality to categorize articles by timestamp.
+
+ Args:
+ input_dir (Path): Directory containing input files.
+ period_type (str): Type of time period to use for categorization.
+ glob_pattern (str): Glob pattern to find input files (e.g., '*.json').
+ output_dir (Path): Directory to save categorized files.
+ """
+ if not input_dir.is_dir():
+ raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
+
+ time_period_class = PERIOD_TYPES[period_type]
+ timestamped_objects: Iterable[TimestampedData] = [
+ time_period_class(path) for path in input_dir.rglob(glob_pattern)
+ ]
+
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ for timestamped_object in tqdm(timestamped_objects,
+ desc="Categorize by timestamp",
+ unit="file"):
+ try:
+ timestamp = timestamped_object.categorize()
+ timestamp_file_name = output_dir / f"{OUTPUT_FILE_NAME}_{timestamp}.csv"
+
+ if timestamp_file_name.exists():
+ df = pd.read_csv(timestamp_file_name)
+ else:
+ df = pd.DataFrame(columns=[FILENAME_COLUMN, ARTICLE_ID_COLUMN])
+
+ new_row = {
+ FILENAME_COLUMN: str(timestamped_object.data()[FILENAME_COLUMN]),
+ ARTICLE_ID_COLUMN: str(timestamped_object.data()[ARTICLE_ID_COLUMN]),
+ }
+ df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+
+ df.to_csv(timestamp_file_name, index=False)
+
+ except Exception as e: # pylint: disable=broad-except
+ logging.error("Error processing timestamped object: %s", str(e))
+
+
+def update_selected_indices_in_file(filepath: str,
+ indices_selected: List[int]) -> None:
+ """
+ Update selected indices in a CSV file.
+
+ Args:
+ filepath (str): The path to the CSV file.
+ indices_selected (List[int]): A list of indices to be marked
+ as selected.
+
+ Raises:
+        ValueError: If indices_selected is empty or contains
+            negative or non-integer values.
+
+ """
+ try:
+ if indices_selected and all(isinstance(idx, int) and idx >= 0
+ for idx in indices_selected):
+ df = pd.read_csv(filepath)
+ df['selected'] = 0
+ df.loc[indices_selected, 'selected'] = 1
+ df.to_csv(filepath, index=False)
+ else:
+ raise ValueError("Invalid indices_selected")
+ except Exception as e: # pylint: disable=W0718
+ logging.error("Error updating selected indices in file: %s",
+ e)
+
+
+def select_final_articles(
+ input_dir: Path,
+ glob_pattern: str,
+ config_path: Path,
+):
+ """
+ Core functionality to select final articles based on keywords and configuration.
+
+ Args:
+ input_dir (Path): Directory containing input files.
+ glob_pattern (str): Glob pattern to match input files (e.g., '*.csv').
+ config_path (Path): Path to the configuration file.
+ """
+ if not input_dir.is_dir():
+ raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
+
+ keywords = get_keywords_from_config(config_path)
+ config_article_selector = read_config(config_path, ARTICLE_SELECTOR_FIELD)
+
+ if len(keywords) > 0 and config_article_selector:
+ for articles_filepath in tqdm(
+ input_dir.rglob(glob_pattern),
+ desc="Processing articles",
+ unit="file",
+ ):
+ try:
+ selected_indices = select_articles(
+ str(articles_filepath), keywords, config_article_selector
+ )
+
+ update_selected_indices_in_file(str(articles_filepath), selected_indices)
+ except Exception as e: # pylint: disable=broad-except
+ logging.error("Error processing file %s: %s", articles_filepath, str(e))
+
+
+def cli():
+ """
+    Command-line interface to filter, categorize, and select articles.
+ """
+    parser = argparse.ArgumentParser(
+        description="Filter, categorize, and select articles from input files."
+    )
+
+    parser.add_argument(
+        "--input-dir",
+        type=Path,
+        required=True,
+        help="Base directory for reading input files.",
+    )
+ parser.add_argument(
+ "--glob",
+ type=str,
+ required=True,
+ help="Glob pattern for find input files; e.g. '*.gz' ",
+ )
+ parser.add_argument(
+ "--config-path",
+ type=Path,
+ default="config.json",
+ help="File path of config file.",
+ )
+ parser.add_argument(
+ "--input-type",
+ type=str,
+ required=True,
+ choices=list(INPUT_FILE_TYPES.keys()),
+ help="Input file format.",
+ )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        required=True,
+        help="The directory for storing output files.",
+    )
+ parser.add_argument(
+ "--period-type",
+ type=str,
+ required=True,
+ choices=list(PERIOD_TYPES.keys()),
+ help="Time periods",
+ )
+ args = parser.parse_args()
+
+ try:
+ filter_articles(
+ input_dir=args.input_dir,
+ glob_pattern=args.glob,
+ config_path=args.config_path,
+ input_type=args.input_type,
+ output_dir=args.output_dir / "output_filter",
+ )
+ categorize_articles(
+ input_dir=args.output_dir / "output_filter",
+ period_type=args.period_type,
+ glob_pattern="*.json",
+ output_dir=args.output_dir / "output_timestamped",
+ )
+
+ select_final_articles(
+ input_dir=args.output_dir / "output_timestamped",
+ glob_pattern="*.csv",
+ config_path=args.config_path,
+ )
+
+ except ValueError as e:
+ parser.error(str(e))
+ except Exception as e: # pylint: disable=broad-except
+ logging.error("Error occurred in CLI: %s", str(e))
+
+
+if __name__ == "__main__":
+ cli()