diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9b20f26 --- /dev/null +++ b/.gitignore @@ -0,0 +1,93 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +.venv/ +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject +.idea diff --git a/README.md b/README.md index fea29f4..62cc034 100644 --- a/README.md +++ b/README.md @@ -1 +1,21 @@ -# JD_Spider_python是一个python抓取京东商城的爬虫,没有使用框架,主要使用了requests,BeautifulSoup,threading,time,mysql数据库存储 +`JD_Spider_python`是一个python抓取京东商城的爬虫,没有使用框架,主要使用了requests,BeautifulSoup,threading,time,mysql数据库存储 + +# Usage + +## requirements + +``` +pip install requests +pip install MySQL-python +pip install beautifulsoup4 +``` + +## db schema +``` +CREATE DATABASE `jd_crawler` /*!40100 DEFAULT CHARACTER SET latin1 */; +CREATE TABLE `JD` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `img_url` varchar(1024) NOT NULL, + PRIMARY KEY (`id`) +) 
ENGINE=InnoDB AUTO_INCREMENT=142 DEFAULT CHARSET=latin1; +``` diff --git a/SQL.py b/SQL.py index d836a28..1742ed8 100644 --- a/SQL.py +++ b/SQL.py @@ -7,12 +7,12 @@ def __init__(self): self.user='root' self.password='root' self.host='localhost' - self.database='python' + self.database='jd_crawler' def get_connection(self): - return db.connect(user="root",passwd="root",host="localhost",db="python",charset="utf8") + return db.connect(user="root",passwd="root",host="127.0.0.1",port=3306,db="jd_crawler",charset="utf8") def save_img(self,url): conn=self.get_connection() cursor=conn.cursor() - cursor.execute("insert into JD(id,img_url) values(NULL,%s)",url) #将img_url插入到数据库中 + cursor.execute("insert into JD(img_url) values(%s)", [url]) # insert img_url into the database conn.commit()