Commit 7223af0

committed: Web Scraper for Articles
1 parent f99c6ee commit 7223af0
File tree

2 files changed: +236 -0 lines changed


Python/web_scraper/README.md

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
# Web Scraper

A Python command-line tool for scraping news articles from websites using the `newspaper3k` library. The tool can extract individual articles or all articles from a news website and export them to JSON or CSV format.

## Features

- **Single Article Scraping**: Extract content from a specific article URL
- **Bulk Article Scraping**: Scrape all articles linked from a news website homepage
- **Multiple Export Formats**: Export data as JSON or CSV
- **Custom File Names**: Specify custom output file names
- **Article Metadata**: Extract title, authors, publication date, content, and URL

## Installation

1. Ensure you have Python 3.6+ installed
2. Install the required dependencies:

```bash
pip install newspaper3k
```

## Usage

### Basic Single Article Scraping

```bash
python web_scraper.py "https://example.com/news-article"
```

This creates a `news.json` file with the scraped article data.

### Scrape All Articles from a News Site

```bash
python web_scraper.py "https://example-news.com" --all-articles
```

### Export to CSV Format

```bash
python web_scraper.py "https://example.com/article" --csv-format
```

### Custom Output File Name

```bash
python web_scraper.py "https://example.com/article" --file my_articles
```
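The scraper appends the extension for the chosen export format whenever the supplied name lacks it, so `--file my_articles` with JSON output produces `my_articles.json`. A minimal sketch of that naming logic (`resolve_file_name` is a hypothetical helper written for illustration, not part of the tool):

```python
def resolve_file_name(file_name: str, export_format: str) -> str:
    """Append .json or .csv unless the name already ends with that extension."""
    extension = '.' + export_format
    if not file_name.endswith(extension):
        return file_name + extension
    return file_name

print(resolve_file_name('my_articles', 'json'))  # my_articles.json
print(resolve_file_name('data.csv', 'csv'))      # data.csv
```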
### Combine Options

```bash
# Scrape all articles and export as CSV with a custom filename
python web_scraper.py "https://example-news.com" -a -csv -f my_data
```

## Command Line Arguments

| Argument | Short | Description | Default |
|----------|-------|-------------|---------|
| `url` | - | URL of the webpage to scrape (required) | - |
| `--file` | `-f` | Custom output filename | `news` |
| `--csv-format` | `-csv` | Export to CSV instead of JSON | `False` |
| `--all-articles` | `-a` | Scrape all articles from the site | `False` |

## Output Format

### JSON Output

```json
[
  {
    "title": "Article Title",
    "authors": ["Author One", "Author Two"],
    "publish_date": "2023-10-15 14:30:00",
    "text": "Full article content...",
    "url": "https://example.com/article"
  }
]
```
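The exported JSON can be read back with the standard library alone. A small round-trip sketch using sample data in the format shown above (the record here is illustrative, not real scraped output):

```python
import json

# One record mirroring the export format above (sample data).
articles = [{
    'title': 'Article Title',
    'authors': ['Author One', 'Author Two'],
    'publish_date': '2023-10-15 14:30:00',
    'text': 'Full article content...',
    'url': 'https://example.com/article',
}]

# json.dump writes this structure to the output file; round-tripping
# through a string shows the data survives serialization unchanged.
loaded = json.loads(json.dumps(articles, indent=2))
print(loaded[0]['authors'])  # ['Author One', 'Author Two']
```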
### CSV Output

The CSV file contains columns for:

- `title`
- `authors` (as a string representation of the list)
- `publish_date`
- `text`
- `url`
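Because `csv.DictWriter` stores the authors list as its string representation (e.g. `"['Author One', 'Author Two']"`), reading the CSV back requires parsing that cell into a real list. A sketch of the round trip on sample data, using `ast.literal_eval` for the parse:

```python
import ast
import csv
import io

# Write one sample row the same way the tool does: DictWriter stringifies
# the authors list into the cell.
buffer = io.StringIO()
writer = csv.DictWriter(buffer, fieldnames=['title', 'authors', 'publish_date', 'text', 'url'])
writer.writeheader()
writer.writerow({
    'title': 'Article Title',
    'authors': ['Author One', 'Author Two'],
    'publish_date': '2023-10-15 14:30:00',
    'text': 'Full article content...',
    'url': 'https://example.com/article',
})

# Read it back and recover the list from its string representation.
buffer.seek(0)
row = next(csv.DictReader(buffer))
authors = ast.literal_eval(row['authors'])  # "['Author One', 'Author Two']" -> list
print(authors)  # ['Author One', 'Author Two']
```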
## Examples

1. **Scrape a single article to JSON:**
   ```bash
   python web_scraper.py "https://www.bbc.com/news/world-us-canada-12345678"
   ```

2. **Scrape all articles from CNN and export as CSV:**
   ```bash
   python web_scraper.py "https://www.cnn.com" -a -csv -f cnn_articles
   ```

3. **Scrape with custom JSON filename:**
   ```bash
   python web_scraper.py "https://example.com/article" -f my_article_data
   ```

## Notes

- The tool uses the `newspaper3k` library, which may not work with all websites, especially those with heavy JavaScript rendering or anti-scraping measures
- Some news sites may block automated scraping attempts
- The quality of extracted content depends on the website's structure and the `newspaper3k` library's parsing capabilities
- For sites with many articles, `--all-articles` may take considerable time

## Error Handling

- If scraping fails, the tool displays an error message
- Empty results are indicated with appropriate messages
- Network issues and parsing errors are caught and reported

## License

This tool is provided for educational and personal use. Please respect website terms of service and robots.txt files when scraping.

Python/web_scraper/web_scraper.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
#!/usr/bin/python3

import argparse
import csv
import json

import newspaper


class WebScraper:
    def __init__(self, url, file_name='news', export_format='json'):
        self.url = url

        if export_format not in ('json', 'csv'):
            raise ValueError('Export format must be either json or csv.')

        self.export_format = export_format

        # Append the matching extension unless the caller already supplied it.
        if export_format == 'json' and not file_name.endswith('.json'):
            self.FILE_NAME = file_name + '.json'
        elif export_format == 'csv' and not file_name.endswith('.csv'):
            self.FILE_NAME = file_name + '.csv'
        else:
            self.FILE_NAME = file_name

    def export_to_JSON(self, articles):
        with open(self.FILE_NAME, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2)

    def export_to_CSV(self, articles):
        with open(self.FILE_NAME, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'authors', 'publish_date', 'text', 'url'])
            writer.writeheader()
            for article in articles:
                writer.writerow(article)

    def get_one_article(self, url=None):
        """Download and parse a single article, returning its metadata as a dict."""
        target_url = url or self.url
        try:
            article = newspaper.Article(target_url)
            article.download()
            article.parse()
            return {
                'title': article.title or 'No title found',
                'authors': article.authors or ['Unknown author'],
                'publish_date': article.publish_date.strftime('%Y-%m-%d %H:%M:%S') if article.publish_date else None,
                'text': article.text or 'No content found',
                'url': target_url,
            }
        except Exception as e:
            print(f'Error scraping {target_url}: {e}')
            return None

    def get_all_articles(self):
        """Scrape every article linked from the site, skipping any that fail."""
        try:
            summaries = []
            # memoize_articles=False forces a fresh crawl instead of reusing cached results.
            paper = newspaper.build(self.url, memoize_articles=False)
            for art in paper.articles:
                summary = self.get_one_article(art.url)
                if summary:
                    summaries.append(summary)
            return summaries
        except Exception as e:
            print(f'Error building newspaper from {self.url}: {e}')
            return []


def main():
    parser = argparse.ArgumentParser(description='Web Scraper for News')
    parser.add_argument('url', help='URL of the webpage to scrape')
    parser.add_argument('--file', '-f', default='news',
                        help='Custom output file (default: news.json or news.csv)')
    parser.add_argument('--csv-format', '-csv', action='store_true',
                        help='Export to CSV format instead of JSON format')
    parser.add_argument('--all-articles', '-a', action='store_true',
                        help='Scrape all articles linked from the URL instead of only the article at the URL itself')

    args = parser.parse_args()

    export_format = 'csv' if args.csv_format else 'json'

    try:
        web_scraper = WebScraper(
            url=args.url,
            file_name=args.file,
            export_format=export_format,
        )

        if args.all_articles:
            articles = web_scraper.get_all_articles()
        else:
            single_article = web_scraper.get_one_article()
            articles = [single_article] if single_article else []

        if articles:
            if export_format == 'json':
                web_scraper.export_to_JSON(articles)
            else:
                web_scraper.export_to_CSV(articles)

            print(f'Successfully exported {len(articles)} articles to {web_scraper.FILE_NAME}')
        else:
            print('No articles found to export.')

    except Exception as e:
        print(f'Error: {e}')


if __name__ == '__main__':
    main()

0 commit comments