- Python 3.x
- libssl-dev
- git clone
git clone https://github.com/GundamBox/PttCrawler.git
- change directory
cd PttCrawler
- Check Python and Pip Version
Must use Python3
sudo apt-get install python3 python3-pip
- Install Package
sudo apt-get install libssl-dev
sudo pip3 install -r requirements.txt
- Copy `config_example.ini` as `config.ini`
cp config_example.ini config.ini
- Upgrade SQLite Database
alembic upgrade head
[Database]
# Database Url: [Type]://[Name]
# Currently only SQLite is supported
Type = sqlite
Name = ptt.db
[PttUser]
# Delay time between each action on term.ptt.cc
Delaytime = 2
# selenium webdriver folder
WebdriverFolder = webdriver
# term.ptt.cc bot login id/password
UserId = guest
UserPwd = guest
# Choices = {database, json, both}
Output = both
[PttArticle]
# Delaytime: delay time for each article
# NextPageDelaytime: delay time for each index
Delaytime = 2.0
NextPageDelaytime = 10.0
# request timeout
Timeout = 10
# Choices = {database, json, both}
Output = both
# The article history keeps at most 30 versions.
python init_db.py
- PTT Article index
python -m crawler article_index --board-name BOARD_NAME \
    [--before | --after] [--index INDEX]
- PTT Article
python -m crawler article --board-name BOARD_NAME \
    (--start-date | --index START_INDEX END_INDEX | --database) \
    (--add | --upgrade) \
    [--config-path CONFIG_PATH]
- PTT User last login record
python -m crawler user (--database | --id ID) [--config-path CONFIG_PATH]
- PTT IP autonomous system number
python -m crawler asn (--database | --ip IP) [--config-path CONFIG_PATH]
Export to a file in ods, csv, or json format
python export.py --format {ods, csv, json} --output-folder OUTPUT_FOLDER [--output-prefix OUTPUT_PREFIX]
- Update
python schedule.py update {article, asn, user} -c CYCLE_TIME [-s START_DATETIME] [--virtualenv VIRTUALENV_PATH]
- Remove
python schedule.py remove {article, asn, user}
- Windows
# init_db.exe
pyinstaller -F --clean ^
    --hidden-import logging.config ^
    --hidden-import typing ^
    --hidden-import sqlalchemy.ext.declarative ^
    init_db.py

# export.exe
pyinstaller -F --clean ^
    --hidden-import pyexcel_io.readers ^
    --hidden-import pyexcel_io.writers ^
    --hidden-import pyexcel_io.database ^
    --hidden-import pyexcel_ods3.odsw ^
    --hidden-import sqlalchemy.ext.baked ^
    export.py

# query.exe
pyinstaller -F --clean ^
    --hidden-import pyexcel_io.readers ^
    --hidden-import pyexcel_io.writers ^
    --hidden-import pyexcel_io.database ^
    --hidden-import pyexcel_ods3.odsw ^
    --hidden-import sqlalchemy.ext.baked ^
    query.py

# schedule.exe
# `python-crontab` does not work on Windows
# Todo: find another package to replace `python-crontab`
pyinstaller -F --clean ^
    schedule.py

# crawler.exe
pyinstaller -F --clean ^
    --hidden-import sqlalchemy.ext.baked ^
    --name crawler.exe ^
    crawler\__main__.py
- Linux
# init_db
pyinstaller -F --clean \
    --hidden-import logging.config \
    --hidden-import typing \
    --hidden-import sqlalchemy.ext.declarative \
    init_db.py

# export
pyinstaller -F --clean \
    --hidden-import pyexcel_io.readers \
    --hidden-import pyexcel_io.writers \
    --hidden-import pyexcel_io.database \
    --hidden-import pyexcel_ods3.odsw \
    --hidden-import sqlalchemy.ext.baked \
    export.py

# query
pyinstaller -F --clean \
    --hidden-import pyexcel_io.readers \
    --hidden-import pyexcel_io.writers \
    --hidden-import pyexcel_io.database \
    --hidden-import pyexcel_ods3.odsw \
    --hidden-import sqlalchemy.ext.baked \
    query.py

# schedule
pyinstaller -F --clean \
    schedule.py

# crawler
pyinstaller -F --clean \
    --hidden-import sqlalchemy.ext.baked \
    --name crawler \
    crawler/__main__.py
PttCrawler/
├── CHANGELOG.md
├── config_example.ini
├── init_db.py
├── crawler/
│ ├── __init__.py
│ ├── __main__.py
│ ├── crawler_arg.py
│ ├── article_index.py
│ ├── article.py
│ ├── asn.py
│ └── user.py
├── db_migration/
│ ├── env.py
│ ├── README
│ ├── script.py.mako
│ └── versions/
│     ├── 77eaebfa8062_create_initial_table.py
│     ├── 64f93945c28a_edit_article_table.py
│     ├── 6794412e2720_edit_article_history_on_delete_actions.py
│     └── 3af39c6792c0_edit_datetime_nullable.py
├── doc/
│ ├── img/
│ ├── en.md
│ └── zh.md
├── models/
│ ├── __init__.py
│ ├── base.py
│ ├── article.py
│ ├── asn.py
│ └── user.py
├── webdriver/
├── env_wrapper.sh
├── export.py
├── query.py
├── schedule.py
├── utils.py
├── requirements.txt
└── README.md