From 3e1a9c85a9e024f8bbe2922c8d1a572ae2a0234f Mon Sep 17 00:00:00 2001
From: parisa-zahedi
Date: Fri, 10 Jan 2025 13:30:17 +0100
Subject: [PATCH 1/2] merge multiple scripts into one
---
README.md | 141 ++++++++----------
config.json | 33 ++--
pyproject.toml | 11 +-
..._generate_output.py => generate_output.py} | 0
scripts/step1_filter_articles.py | 101 -------------
scripts/step2_categorize_by_timestamp.py | 117 ---------------
scripts/step3_select_final_articles.py | 119 ---------------
7 files changed, 95 insertions(+), 427 deletions(-)
rename scripts/{step4_generate_output.py => generate_output.py} (100%)
delete mode 100644 scripts/step1_filter_articles.py
delete mode 100644 scripts/step2_categorize_by_timestamp.py
delete mode 100644 scripts/step3_select_final_articles.py
diff --git a/README.md b/README.md
index a32351b..495938c 100644
--- a/README.md
+++ b/README.md
@@ -20,30 +20,11 @@ To install and run this project you need to have the following prerequisites ins
```
### Installation
-#### Option 1 - Install dataQuest package
To run the project, install the dataQuest package that is part of this project.
```
pip install dataQuest
```
-#### Option 2 - Run from source code
-If you want to run the scripts without installation you need to:
-- Install requirement
-```commandline
-pip install setuptools wheel
-python -m pip install build
-```
-Change your current working directory to the location of your pyproject.toml file.
-```
-python -m build
-pip install .
-```
-- Set PYTHONPATH environment:
-On Linux and Mac OS, you might have to set the PYTHONPATH environment variable to point to this directory.
-
-```commandline
-export PYTHONPATH="current working directory/dataQuest:${PYTHONPATH}"
-```
### Built with
These packages are automatically installed in the step above:
* [scikit-learn](https://scikit-learn.org/stable/)
@@ -93,92 +74,96 @@ extractor.extract_xml_string()
Navigate to the scripts folder and run:
```
-python3 convert_input_files.py --input_dir path/to/raw/xml/data --output_dir path/to/converted/json/compressed/output
+python3 convert_input_files.py \
+    --input_dir path/to/raw/xml/data \
+    --output_dir path/to/converted/json/compressed/output
```
#### Customize input-file
-In order to define a corpus with a new data format you should:
+To add a new corpus to dataQuest you should:
+- prepare your input data in the JSON format explained above.
- add a new input_file_type to [INPUT_FILE_TYPES](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/__init__.py)
- implement a class that inherits from [input_file.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/input_file.py).
This class is customized to read a new data format. In our case study we defined [delpher_kranten.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/delpher_kranten.py); a minimal sketch of such a class follows.
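For illustration, here is what such a class could look like. The method name and the article fields are assumptions for this example; mirror the abstract interface actually defined in [input_file.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/input_file.py):
```
from dataQuest.filter.input_file import InputFile


class MyCorpusFile(InputFile):
    """Hypothetical reader for a new corpus format."""

    def articles(self):
        # Parse one input file of the new format and yield its
        # articles (id, title, date, body) one at a time, so the
        # configured filters can be applied to each of them.
        ...


# Register the class under the name passed via --input-type,
# e.g. in dataQuest/filter/__init__.py (illustrative):
# INPUT_FILE_TYPES["my_corpus"] = MyCorpusFile
```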
-### 2. Filtering
-In this step, you may select articles based on a filter or a collection of filters. Articles can be filtered by title, year, decade, or a set of keywords defined in the ```config.json``` file.
+### 2. Filter articles
+You can select articles based on a single filter or a combination of filters. Articles can be filtered by title, year,
+decade, or a set of keywords defined in the ```config.json``` file. Logical operators such as AND, OR, and NOT can be used to combine filtering expressions.
+
+In the following example, you select articles that include any of the specified keywords AND were published between 1800 and 1910 AND do not
+contain advertisements (e.g., "Advertentie").
```commandline
"filters": [
{
"type": "AndFilter",
- "filters": [
- {
- "type": "OrFilter",
- "filters": [
+ "filters": [
{
"type": "YearFilter",
"start_year": 1800,
"end_year": 1910
},
{
- "type": "DecadeFilter",
- "decade": 1960
+ "type": "NotFilter",
+ "filter": {
+ "type": "ArticleTitleFilter",
+ "article_title": "Advertentie"
+ },
+ "level": "article"
+ },
+ {
+ "type": "KeywordsFilter",
+ "keywords": ["sustainability", "green"]
}
- ]
- },
- {
- "type": "NotFilter",
- "filter": {
- "type": "ArticleTitleFilter",
- "article_title": "Advertentie"
- },
- "level": "article"
- },
- {
- "type": "KeywordsFilter",
- "keywords": ["sustainability", "green"]
- }
- ]
+ ]
}
- ]
+ ],
```
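Conceptually, ```load_filters_from_config``` (imported from ```dataQuest.utils``` in the scripts) turns this JSON into a tree of filter objects that is evaluated per article. The sketch below illustrates how such a composite evaluates; the class and method names are simplified stand-ins, not the package's actual API:
```
class YearFilter:
    def __init__(self, start_year, end_year):
        self.start_year, self.end_year = start_year, end_year

    def matches(self, article):
        return self.start_year <= article.year <= self.end_year


class AndFilter:
    def __init__(self, filters):
        self.filters = filters

    def matches(self, article):
        # Accept only articles that pass every sub-filter.
        return all(f.matches(article) for f in self.filters)


class NotFilter:
    def __init__(self, inner):
        self.inner = inner

    def matches(self, article):
        # Invert the wrapped filter's decision.
        return not self.inner.matches(article)
```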
-run the following to filter the articles:
-```commandline
-python3 scripts/step1_filter_articles.py --input-dir "path/to/converted/json/compressed/output/" --output-dir "output_filter/" --input-type "delpher_kranten" --glob "*.gz"
-```
-In our case, input-type is "delpher_kranten", and input data is a set of compresed json files with ```.gz``` extension.
+To select the most relevant articles:
+1. Articles are selected based on the filters in the config file.
-The output of this script is a JSON file for each selected article in the following format:
-```commandline
-{
- "file_path": "output/transfered_data/00/KRANTEN_KBPERS01_000002100.json.gz",
- "article_id": "5",
- "Date": "1878-04-29",
- "Title": "Opregte Haarlemsche Courant"
-}
-```
-### 3. Categorization by timestamp
-The output files generated in the previous step are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/temporal_categorization/__init__.py),
-such as ```year``` or ```decade```. This categorization is essential for subsequent steps, especially if you intend to apply tf-idf or other models to specific periods. In our case, we applied tf-idf per decade.
+
+2. Selected articles are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/temporal_categorization/__init__.py),
+such as ```year``` or ```decade```. This categorization is essential for subsequent steps, especially when applying TF-IDF or other models to specific periods.
+
+
+3. The most relevant articles related to the specified topic (defined by the provided keywords) are selected:
+   3.1. Articles that contain any of the specified keywords in their title are selected first.
+
+   3.2. TF-IDF (the default model, which can be extended to other models) ranks and selects the remaining articles.
```commandline
-python3 scripts/step2_categorize_by_timestamp.py --input-dir "output_filter/" --glob "*.json" --period-type "decade" --output-dir "output_timestamped/"
+python3 scripts/filter_articles.py \
+    --input-dir "path/to/converted/json/compressed/" \
+    --output-dir "output/" \
+    --input-type "delpher_kranten" \
+    --glob "*.gz" \
+    --period-type "decade"
```
-The output consists of a .csv file for each period, such as one file per decade, containing the ```file_path``` and ```article_id``` of selected articles.
+In our case:
+- The input data consists of compressed JSON files with the .gz extension.
+- The input type is "delpher_kranten".
+- Selected articles are categorized by decade.
-### 4. Select final articles
-This step is applicable when articles are filtered (in step 2) using a set of keywords.
-By utilizing tf-idf, the most relevant articles related to the specified topic (defined by the provided keywords) are selected.
-Before applying tf-idf, articles containing any of the specified keywords in their title are selected.
+#### Output
+The output consists of a .csv file for each period, such as one file per decade. Each file contains the ```file_path``` and ```article_id``` of the filtered articles,
+along with an additional column, ```selected```, which indicates the articles labeled as the most relevant by the model (e.g., TF-IDF).
-From the rest of articles, to choose the most relevant ones, you can specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json):
+There are different strategies for selecting the final articles. You should specify one of the following criteria in [config.json](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json):
-- Percentage of selected articles with the top scores
-- Maximum number of selected articles with the top scores
-- Threshold for the value of cosine similarity between the embeddings of list of keywords and each article.
+- Percentage: Select a percentage of articles with the highest scores.
+- Maximum number: Specify the maximum number of articles to select based on their scores.
+- Threshold: Set a threshold on the cosine similarity between the embeddings of the keyword list and each article.
```commandline
"article_selector":
@@ -204,12 +189,8 @@ From the rest of articles, to choose the most relevant ones, you can specify one
},
```
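As an illustration of how these strategies could be applied, the sketch below ranks articles against the keyword list using TF-IDF and cosine similarity, then applies each criterion. The selector field names are assumptions for this example; the package's actual logic lives in ```dataQuest.article_final_selection```:
```
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def rank_articles(texts, keywords):
    """Score each article text against the keyword list."""
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts + [" ".join(keywords)])
    # Cosine similarity between each article and the keyword "document".
    return cosine_similarity(matrix[:-1], matrix[-1:]).ravel()


def select_indices(scores, selector):
    """Apply one of the three selection strategies to the scores."""
    order = np.argsort(scores)[::-1]  # highest scores first
    if selector["type"] == "percentage":
        top = max(1, int(len(scores) * selector["percentage"] / 100))
        return order[:top].tolist()
    if selector["type"] == "maximum_number":
        return order[: selector["maximum_number"]].tolist()
    # "threshold": keep everything at least as similar as the cut-off.
    return [i for i in order if scores[i] >= selector["threshold"]]
```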
-The following script, add a new column, ```selected``` to the .csv files from the previous step.
-```commandline
-python3 scripts/step3_select_final_articles.py --input-dir "output/output_timestamped/"
-```
-### 5. Generate output
+### 3. Generate output
As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user can choose whether the text is divided into paragraphs or into fixed-size segments.
This feature can be set in [config.json](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json).
```commandline
@@ -225,7 +206,10 @@ OR
```
```commandline
-python3 scripts/step4_generate_output.py --input-dir "output/output_timestamped/” --output-dir “output/output_results/“ --glob “*.csv”
+python3 scripts/generate_output.py \
+    --input-dir "output/output_timestamped/" \
+    --output-dir "output/output_results/" \
+    --glob "*.csv"
```
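As a rough illustration of the two options, paragraph mode keeps natural text units, while segmentation cuts the text into chunks of similar size. A minimal sketch, assuming blank-line paragraph boundaries and fixed-size character segments (the package's actual splitting logic may differ):
```
def split_into_paragraphs(text):
    # Keep natural paragraph boundaries (blank lines).
    return [p.strip() for p in text.split("\n\n") if p.strip()]


def split_into_segments(text, size=500):
    # Cut the text into fixed-size character chunks.
    return [text[i:i + size] for i in range(0, len(text), size)]
```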
## About the Project
**Date**: February 2024
@@ -239,7 +223,6 @@ Pim Huijnen (p.huijnen@uu.nl)
- Parisa Zahedi (p.zahedi@uu.nl)
- Shiva Nadi (s.nadi@uu.nl)
-- Matty Vermet (m.s.vermet@uu.nl)
### License
diff --git a/config.json b/config.json
index 84f6a9a..7fb7e97 100644
--- a/config.json
+++ b/config.json
@@ -1,13 +1,28 @@
{
- "filters": [
- {
- "type": "KeywordsFilter",
- "keywords": ["windkracht", "windenergie", "windenergiebranche", "windturbine", "windstroom",
- "zonne-energie", "zonnewarmte", "zonnestraling", "geothermische energie", "aardwarmte",
- "waterkracht", "waterkrachtcentrale", "waterkrachtwerken", "waterstof", "waterstofenergie",
- "hydroturbine", "getijden-energie", "ethanol-produktie", "ethanol"]
- }
- ],
+ "filters": [
+ {
+ "type": "AndFilter",
+ "filters": [
+ {
+ "type": "YearFilter",
+ "start_year": 1800,
+ "end_year": 1910
+ },
+ {
+ "type": "NotFilter",
+ "filter": {
+ "type": "ArticleTitleFilter",
+ "article_title": "Advertentie"
+ },
+ "level": "article"
+ },
+ {
+ "type": "KeywordsFilter",
+ "keywords": ["sustainability", "green"]
+ }
+ ]
+ }
+ ],
"article_selector":
{
"type": "percentage",
diff --git a/pyproject.toml b/pyproject.toml
index 7ef3a03..2420ec4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,9 +6,8 @@ build-backend = "setuptools.build_meta"
name = "dataQuest"
description = "A package to extract hystorical news sentiments"
authors = [
- {name = "Shiva Nadi", email = "s.nadi@uu.nl"},
{name = "Parisa Zahedi", email = "p.zahedi@uu.nl"},
- {name = "Matty Vermet", email = "m.s.vermet@uu.nl"}
+ {name = "Shiva Nadi", email = "s.nadi@uu.nl"}
]
readme = "README.md"
requires-python = ">=3.8"
@@ -52,3 +51,11 @@ module = [
"sklearn.*"
]
ignore_missing_imports = true
+
+[project.scripts]
+filter-articles = "dataQuest.filter_articles:cli"
+generate-output = "dataQuest.generate_output:cli"
+
+[tool.setuptools.package-data]
+"dataQuest" = ["config.json"]
+
diff --git a/scripts/step4_generate_output.py b/scripts/generate_output.py
similarity index 100%
rename from scripts/step4_generate_output.py
rename to scripts/generate_output.py
diff --git a/scripts/step1_filter_articles.py b/scripts/step1_filter_articles.py
deleted file mode 100644
index 4e67b7a..0000000
--- a/scripts/step1_filter_articles.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-This script filter articles from input files according to
-specified configurations.
-"""
-
-import argparse
-from pathlib import Path
-from typing import Iterable
-from tqdm import tqdm
-from dataQuest.filter import INPUT_FILE_TYPES
-from dataQuest.filter.input_file import InputFile
-from dataQuest.utils import load_filters_from_config
-from dataQuest.utils import save_filtered_articles
-
-
-def filter_articles(
- input_dir: Path,
- glob_pattern: str,
- config_path: Path,
- input_type: str,
- output_dir: Path,
-):
- """
- Core functionality to process files, filter articles, and save results.
-
- Args:
- input_dir (Path): Directory containing input files.
- glob_pattern (str): Glob pattern to match input files.
- config_path (Path): Path to the configuration file.
- input_type (str): File format of the input files.
- output_dir (Path): Directory to save filtered articles.
- """
- if not input_dir.is_dir():
- raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
-
- input_file_class = INPUT_FILE_TYPES[input_type]
- input_files: Iterable[InputFile] = [
- input_file_class(path) for path in input_dir.rglob(glob_pattern)
- ]
-
- output_dir.mkdir(parents=True, exist_ok=True)
-
- compound_filter = load_filters_from_config(config_path)
-
- for input_file in tqdm(input_files, desc="Filtering articles", unit="file"):
- for article in input_file.selected_articles(compound_filter):
- save_filtered_articles(input_file, article.id, output_dir)
-
-
-def cli():
- """
- Command-line interface for filter articles.
- """
- parser = argparse.ArgumentParser("Filter articles from input files.")
-
- parser.add_argument(
- "--input-dir",
- type=Path,
- help="Base directory for reading input files. ",
- )
- parser.add_argument(
- "--glob",
- type=str,
- required=True,
- help="Glob pattern for find input files; e.g. '*.gz' ",
- )
- parser.add_argument(
- "--config-path",
- type=Path,
- default="config.json",
- help="File path of config file.",
- )
- parser.add_argument(
- "--input-type",
- type=str,
- required=True,
- choices=list(INPUT_FILE_TYPES.keys()),
- help="Input file format.",
- )
- parser.add_argument(
- "--output-dir",
- type=Path,
- help="The directory for storing output files.",
- )
-
- args = parser.parse_args()
-
- try:
- filter_articles(
- input_dir=args.input_dir,
- glob_pattern=args.glob,
- config_path=args.config_path,
- input_type=args.input_type,
- output_dir=args.output_dir,
- )
- except ValueError as e:
- parser.error(str(e))
-
-
-if __name__ == "__main__":
- cli()
diff --git a/scripts/step2_categorize_by_timestamp.py b/scripts/step2_categorize_by_timestamp.py
deleted file mode 100644
index a77eac8..0000000
--- a/scripts/step2_categorize_by_timestamp.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""
-This script defines functions and classes to categorize files based
-on their timestamps.
-"""
-import argparse
-import logging
-from typing import Iterable
-from pathlib import Path
-import pandas as pd
-from tqdm import tqdm # type: ignore
-from dataQuest.temporal_categorization import PERIOD_TYPES
-from dataQuest.temporal_categorization.timestamped_data import TimestampedData
-
-OUTPUT_FILE_NAME = 'articles'
-FILENAME_COLUMN = 'file_path'
-ARTICLE_ID_COLUMN = 'article_id'
-
-
-def categorize_articles(
- input_dir: Path,
- period_type: str,
- glob_pattern: str,
- output_dir: Path,
-):
- """
- Core functionality to categorize articles by timestamp.
-
- Args:
- input_dir (Path): Directory containing input files.
- period_type (str): Type of time period to use for categorization.
- glob_pattern (str): Glob pattern to find input files (e.g., '*.json').
- output_dir (Path): Directory to save categorized files.
- """
- if not input_dir.is_dir():
- raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
-
- time_period_class = PERIOD_TYPES[period_type]
- timestamped_objects: Iterable[TimestampedData] = [
- time_period_class(path) for path in input_dir.rglob(glob_pattern)
- ]
-
- output_dir.mkdir(parents=True, exist_ok=True)
-
- for timestamped_object in tqdm(timestamped_objects,
- desc="Categorize by timestamp",
- unit="file"):
- try:
- timestamp = timestamped_object.categorize()
- timestamp_file_name = output_dir / f"{OUTPUT_FILE_NAME}_{timestamp}.csv"
-
- if timestamp_file_name.exists():
- df = pd.read_csv(timestamp_file_name)
- else:
- df = pd.DataFrame(columns=[FILENAME_COLUMN, ARTICLE_ID_COLUMN])
-
- new_row = {
- FILENAME_COLUMN: str(timestamped_object.data()[FILENAME_COLUMN]),
- ARTICLE_ID_COLUMN: str(timestamped_object.data()[ARTICLE_ID_COLUMN]),
- }
- df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
-
- df.to_csv(timestamp_file_name, index=False)
-
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error processing timestamped object: %s", str(e))
-
-
-def cli():
- """
- Command-line interface for categorize articles by timestamp.
- """
- parser = argparse.ArgumentParser("Categorize articles by timestamp.")
-
- parser.add_argument(
- "--input-dir",
- type=Path,
- required=True,
- help="Base directory for reading input files.",
- )
- parser.add_argument(
- "--period-type",
- type=str,
- required=True,
- choices=list(PERIOD_TYPES.keys()),
- help="Time periods",
- )
- parser.add_argument(
- "--glob",
- type=str,
- required=True,
- default="*.json",
- help="Glob pattern for find input files; e.g. '*.json'.",
- )
- parser.add_argument(
- "--output-dir",
- type=Path,
- required=True,
- help="The directory for storing output files.",
- )
-
- args = parser.parse_args()
-
- try:
- categorize_articles(
- input_dir=args.input_dir,
- period_type=args.period_type,
- glob_pattern=args.glob,
- output_dir=args.output_dir,
- )
- except ValueError as e:
- parser.error(str(e))
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error occurred in CLI: %s", str(e))
-
-
-if __name__ == "__main__":
- cli()
diff --git a/scripts/step3_select_final_articles.py b/scripts/step3_select_final_articles.py
deleted file mode 100644
index 969dc22..0000000
--- a/scripts/step3_select_final_articles.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""Select final articles."""
-import argparse
-import logging
-from typing import List
-from pathlib import Path
-import pandas as pd
-from tqdm import tqdm # type: ignore
-from dataQuest.utils import get_keywords_from_config
-from dataQuest.utils import read_config
-from dataQuest.article_final_selection.process_articles import select_articles
-
-ARTICLE_SELECTOR_FIELD = "article_selector"
-
-
-def update_selected_indices_in_file(filepath: str,
- indices_selected: List[int]) -> None:
- """
- Update selected indices in a CSV file.
-
- Args:
- filepath (str): The path to the CSV file.
- indices_selected (List[int]): A list of indices to be marked
- as selected.
-
- Raises:
- ValueError: If indices_selected is empty or contains
- non-negative integers.
-
- """
- try:
- if indices_selected and all(isinstance(idx, int) and idx >= 0
- for idx in indices_selected):
- df = pd.read_csv(filepath)
- df['selected'] = 0
- df.loc[indices_selected, 'selected'] = 1
- df.to_csv(filepath, index=False)
- else:
- raise ValueError("Invalid indices_selected")
- except Exception as e: # pylint: disable=W0718
- logging.error("Error updating selected indices in file: %s",
- e)
-
-
-def select_final_articles(
- input_dir: Path,
- glob_pattern: str,
- config_path: Path,
-):
- """
- Core functionality to select final articles based on keywords and configuration.
-
- Args:
- input_dir (Path): Directory containing input files.
- glob_pattern (str): Glob pattern to match input files (e.g., '*.csv').
- config_path (Path): Path to the configuration file.
- """
- if not input_dir.is_dir():
- raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
-
- keywords = get_keywords_from_config(config_path)
- config_article_selector = read_config(config_path, ARTICLE_SELECTOR_FIELD)
-
- if len(keywords) > 0 and config_article_selector:
- for articles_filepath in tqdm(
- input_dir.rglob(glob_pattern),
- desc="Processing articles",
- unit="file",
- ):
- try:
- selected_indices = select_articles(
- str(articles_filepath), keywords, config_article_selector
- )
-
- update_selected_indices_in_file(str(articles_filepath), selected_indices)
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error processing file %s: %s", articles_filepath, str(e))
-
-
-def cli():
- """
- Command-line interface for selecting final articles.
- """
- parser = argparse.ArgumentParser("Select final articles.")
-
- parser.add_argument(
- "--input-dir",
- type=Path,
- required=True,
- help="Base directory for reading input files.",
- )
- parser.add_argument(
- "--glob",
- type=str,
- default="*.csv",
- help="Glob pattern for find input files; e.g. '*.csv'.",
- )
- parser.add_argument(
- "--config-path",
- type=Path,
- default="config.json",
- help="File path of config file.",
- )
-
- args = parser.parse_args()
-
- try:
- select_final_articles(
- input_dir=args.input_dir,
- glob_pattern=args.glob,
- config_path=args.config_path,
- )
- except ValueError as e:
- parser.error(str(e))
- except Exception as e: # pylint: disable=broad-except
- logging.error("Error occurred in CLI: %s", str(e))
-
-
-if __name__ == "__main__":
- cli()
From 66d909054c0a050b8b521281be22d7c62a2284b8 Mon Sep 17 00:00:00 2001
From: parisa-zahedi
Date: Fri, 10 Jan 2025 13:31:44 +0100
Subject: [PATCH 2/2] merge three scripts into one
---
scripts/filter_articles.py | 248 +++++++++++++++++++++++++++++++++++++
1 file changed, 248 insertions(+)
create mode 100644 scripts/filter_articles.py
diff --git a/scripts/filter_articles.py b/scripts/filter_articles.py
new file mode 100644
index 0000000..4c8c13f
--- /dev/null
+++ b/scripts/filter_articles.py
@@ -0,0 +1,248 @@
+"""
+This script filters articles from input files according to
+specified configurations, categorizes them by time period,
+and selects the most relevant articles per period.
+"""
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Iterable, List
+import pandas as pd
+from tqdm import tqdm
+
+from dataQuest.filter import INPUT_FILE_TYPES
+from dataQuest.filter.input_file import InputFile
+from dataQuest.utils import load_filters_from_config
+from dataQuest.utils import save_filtered_articles
+from dataQuest.temporal_categorization import PERIOD_TYPES
+from dataQuest.temporal_categorization.timestamped_data import TimestampedData
+from dataQuest.utils import get_keywords_from_config
+from dataQuest.utils import read_config
+from dataQuest.article_final_selection.process_articles import select_articles
+
+ARTICLE_SELECTOR_FIELD = "article_selector"
+OUTPUT_FILE_NAME = 'articles'
+FILENAME_COLUMN = 'file_path'
+ARTICLE_ID_COLUMN = 'article_id'
+
+
+def filter_articles(
+ input_dir: Path,
+ glob_pattern: str,
+ config_path: Path,
+ input_type: str,
+ output_dir: Path,
+):
+ """
+ Core functionality to process files, filter articles, and save results.
+
+ Args:
+ input_dir (Path): Directory containing input files.
+ glob_pattern (str): Glob pattern to match input files.
+ config_path (Path): Path to the configuration file.
+ input_type (str): File format of the input files.
+ output_dir (Path): Directory to save filtered articles.
+ """
+ if not input_dir.is_dir():
+ raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
+
+ input_file_class = INPUT_FILE_TYPES[input_type]
+ input_files: Iterable[InputFile] = [
+ input_file_class(path) for path in input_dir.rglob(glob_pattern)
+ ]
+
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ compound_filter = load_filters_from_config(config_path)
+
+ for input_file in tqdm(input_files, desc="Filtering articles", unit="file"):
+ for article in input_file.selected_articles(compound_filter):
+ save_filtered_articles(input_file, article.id, output_dir)
+
+
+def categorize_articles(
+ input_dir: Path,
+ period_type: str,
+ glob_pattern: str,
+ output_dir: Path,
+):
+ """
+ Core functionality to categorize articles by timestamp.
+
+ Args:
+ input_dir (Path): Directory containing input files.
+ period_type (str): Type of time period to use for categorization.
+ glob_pattern (str): Glob pattern to find input files (e.g., '*.json').
+ output_dir (Path): Directory to save categorized files.
+ """
+ if not input_dir.is_dir():
+ raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
+
+ time_period_class = PERIOD_TYPES[period_type]
+ timestamped_objects: Iterable[TimestampedData] = [
+ time_period_class(path) for path in input_dir.rglob(glob_pattern)
+ ]
+
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ for timestamped_object in tqdm(timestamped_objects,
+ desc="Categorize by timestamp",
+ unit="file"):
+ try:
+ timestamp = timestamped_object.categorize()
+ timestamp_file_name = output_dir / f"{OUTPUT_FILE_NAME}_{timestamp}.csv"
+
+ if timestamp_file_name.exists():
+ df = pd.read_csv(timestamp_file_name)
+ else:
+ df = pd.DataFrame(columns=[FILENAME_COLUMN, ARTICLE_ID_COLUMN])
+
+ new_row = {
+ FILENAME_COLUMN: str(timestamped_object.data()[FILENAME_COLUMN]),
+ ARTICLE_ID_COLUMN: str(timestamped_object.data()[ARTICLE_ID_COLUMN]),
+ }
+ df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+
+ df.to_csv(timestamp_file_name, index=False)
+
+ except Exception as e: # pylint: disable=broad-except
+ logging.error("Error processing timestamped object: %s", str(e))
+
+
+def update_selected_indices_in_file(filepath: str,
+ indices_selected: List[int]) -> None:
+ """
+ Update selected indices in a CSV file.
+
+ Args:
+ filepath (str): The path to the CSV file.
+ indices_selected (List[int]): A list of indices to be marked
+ as selected.
+
+ Raises:
+        ValueError: If indices_selected is empty or contains
+            negative or non-integer values.
+
+ """
+ try:
+ if indices_selected and all(isinstance(idx, int) and idx >= 0
+ for idx in indices_selected):
+ df = pd.read_csv(filepath)
+ df['selected'] = 0
+ df.loc[indices_selected, 'selected'] = 1
+ df.to_csv(filepath, index=False)
+ else:
+ raise ValueError("Invalid indices_selected")
+ except Exception as e: # pylint: disable=W0718
+ logging.error("Error updating selected indices in file: %s",
+ e)
+
+
+def select_final_articles(
+ input_dir: Path,
+ glob_pattern: str,
+ config_path: Path,
+):
+ """
+ Core functionality to select final articles based on keywords and configuration.
+
+ Args:
+ input_dir (Path): Directory containing input files.
+ glob_pattern (str): Glob pattern to match input files (e.g., '*.csv').
+ config_path (Path): Path to the configuration file.
+ """
+ if not input_dir.is_dir():
+ raise ValueError(f"Not a directory: '{str(input_dir.absolute())}'")
+
+ keywords = get_keywords_from_config(config_path)
+ config_article_selector = read_config(config_path, ARTICLE_SELECTOR_FIELD)
+
+ if len(keywords) > 0 and config_article_selector:
+ for articles_filepath in tqdm(
+ input_dir.rglob(glob_pattern),
+ desc="Processing articles",
+ unit="file",
+ ):
+ try:
+ selected_indices = select_articles(
+ str(articles_filepath), keywords, config_article_selector
+ )
+
+ update_selected_indices_in_file(str(articles_filepath), selected_indices)
+ except Exception as e: # pylint: disable=broad-except
+ logging.error("Error processing file %s: %s", articles_filepath, str(e))
+
+
+def cli():
+ """
+    Command-line interface to filter, categorize, and select articles.
+ """
+    parser = argparse.ArgumentParser(
+        description="Filter, categorize, and select articles from input files."
+    )
+
+    parser.add_argument(
+        "--input-dir",
+        type=Path,
+        required=True,
+        help="Base directory for reading input files.",
+    )
+ parser.add_argument(
+ "--glob",
+ type=str,
+ required=True,
+ help="Glob pattern for find input files; e.g. '*.gz' ",
+ )
+ parser.add_argument(
+ "--config-path",
+ type=Path,
+ default="config.json",
+ help="File path of config file.",
+ )
+ parser.add_argument(
+ "--input-type",
+ type=str,
+ required=True,
+ choices=list(INPUT_FILE_TYPES.keys()),
+ help="Input file format.",
+ )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        required=True,
+        help="The directory for storing output files.",
+    )
+ parser.add_argument(
+ "--period-type",
+ type=str,
+ required=True,
+ choices=list(PERIOD_TYPES.keys()),
+ help="Time periods",
+ )
+ args = parser.parse_args()
+
+ try:
+ filter_articles(
+ input_dir=args.input_dir,
+ glob_pattern=args.glob,
+ config_path=args.config_path,
+ input_type=args.input_type,
+ output_dir=args.output_dir / "output_filter",
+ )
+ categorize_articles(
+ input_dir=args.output_dir / "output_filter",
+ period_type=args.period_type,
+ glob_pattern="*.json",
+ output_dir=args.output_dir / "output_timestamped",
+ )
+
+ select_final_articles(
+ input_dir=args.output_dir / "output_timestamped",
+ glob_pattern="*.csv",
+ config_path=args.config_path,
+ )
+
+ except ValueError as e:
+ parser.error(str(e))
+ except Exception as e: # pylint: disable=broad-except
+ logging.error("Error occurred in CLI: %s", str(e))
+
+
+if __name__ == "__main__":
+ cli()