Skip to content

Commit 0744132

Browse files
author
yitian
committed
添加了scrapy爬虫框架的例子
1 parent 2a860d2 commit 0744132

File tree

11 files changed

+428
-1
lines changed

11 files changed

+428
-1
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,7 @@
1212

1313
添加了flask的简单例子,演示了基本使用方法。
1414

15-
![运行截图](flask-sample/flask-sample.PNG)
15+
![运行截图](flask-sample/flask-sample.PNG)
16+
17+
## scrapy_sample
18+
添加了Scrapy爬虫框架的例子。

scrapy_sample/.gitignore

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
# Created by .ignore support plugin (hsz.mobi)
2+
### Eclipse template
3+
4+
.metadata
5+
bin/
6+
tmp/
7+
*.tmp
8+
*.bak
9+
*.swp
10+
*~.nib
11+
local.properties
12+
.settings/
13+
.loadpath
14+
.recommenders
15+
16+
# Eclipse Core
17+
.project
18+
19+
# External tool builders
20+
.externalToolBuilders/
21+
22+
# Locally stored "Eclipse launch configurations"
23+
*.launch
24+
25+
# PyDev specific (Python IDE for Eclipse)
26+
*.pydevproject
27+
28+
# CDT-specific (C/C++ Development Tooling)
29+
.cproject
30+
31+
# JDT-specific (Eclipse Java Development Tools)
32+
.classpath
33+
34+
# Java annotation processor (APT)
35+
.factorypath
36+
37+
# PDT-specific (PHP Development Tools)
38+
.buildpath
39+
40+
# sbteclipse plugin
41+
.target
42+
43+
# Tern plugin
44+
.tern-project
45+
46+
# TeXlipse plugin
47+
.texlipse
48+
49+
# STS (Spring Tool Suite)
50+
.springBeans
51+
52+
# Code Recommenders
53+
.recommenders/
54+
55+
# Scala IDE specific (Scala & Java development for Eclipse)
56+
.cache-main
57+
.scala_dependencies
58+
.worksheet
59+
### Python template
60+
# Byte-compiled / optimized / DLL files
61+
__pycache__/
62+
*.py[cod]
63+
*$py.class
64+
65+
# C extensions
66+
*.so
67+
68+
# Distribution / packaging
69+
.Python
70+
env/
71+
build/
72+
develop-eggs/
73+
dist/
74+
downloads/
75+
eggs/
76+
.eggs/
77+
lib/
78+
lib64/
79+
parts/
80+
sdist/
81+
var/
82+
wheels/
83+
*.egg-info/
84+
.installed.cfg
85+
*.egg
86+
87+
# PyInstaller
88+
# Usually these files are written by a python script from a template
89+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
90+
*.manifest
91+
*.spec
92+
93+
# Installer logs
94+
pip-log.txt
95+
pip-delete-this-directory.txt
96+
97+
# Unit test / coverage reports
98+
htmlcov/
99+
.tox/
100+
.coverage
101+
.coverage.*
102+
.cache
103+
nosetests.xml
104+
coverage.xml
105+
*.cover
106+
.hypothesis/
107+
108+
# Translations
109+
*.mo
110+
*.pot
111+
112+
# Django stuff:
113+
*.log
114+
local_settings.py
115+
116+
# Flask stuff:
117+
instance/
118+
.webassets-cache
119+
120+
# Scrapy stuff:
121+
.scrapy
122+
123+
# Sphinx documentation
124+
docs/_build/
125+
126+
# PyBuilder
127+
target/
128+
129+
# Jupyter Notebook
130+
.ipynb_checkpoints
131+
132+
# pyenv
133+
.python-version
134+
135+
# celery beat schedule file
136+
celerybeat-schedule
137+
138+
# SageMath parsed files
139+
*.sage.py
140+
141+
# dotenv
142+
.env
143+
144+
# virtualenv
145+
.venv
146+
venv/
147+
ENV/
148+
149+
# Spyder project settings
150+
.spyderproject
151+
152+
# Rope project settings
153+
.ropeproject
154+
### Example user template template
155+
### Example user template
156+
157+
# IntelliJ project files
158+
.idea
159+
*.iml
160+
out
161+
gen
### JetBrains template
162+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
163+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
164+
165+
# User-specific stuff:
166+
.idea/**/workspace.xml
167+
.idea/**/tasks.xml
168+
.idea/dictionaries
169+
170+
# Sensitive or high-churn files:
171+
.idea/**/dataSources/
172+
.idea/**/dataSources.ids
173+
.idea/**/dataSources.xml
174+
.idea/**/dataSources.local.xml
175+
.idea/**/sqlDataSources.xml
176+
.idea/**/dynamic.xml
177+
.idea/**/uiDesigner.xml
178+
179+
# Gradle:
180+
.idea/**/gradle.xml
181+
.idea/**/libraries
182+
183+
# Mongo Explorer plugin:
184+
.idea/**/mongoSettings.xml
185+
186+
## File-based project format:
187+
*.iws
188+
189+
## Plugin-specific files:
190+
191+
# IntelliJ
192+
/out/
193+
194+
# mpeltonen/sbt-idea plugin
195+
.idea_modules/
196+
197+
# JIRA plugin
198+
atlassian-ide-plugin.xml
199+
200+
# Crashlytics plugin (for Android Studio and IntelliJ)
201+
com_crashlytics_export_strings.xml
202+
crashlytics.properties
203+
crashlytics-build.properties
204+
fabric.properties
205+

scrapy_sample/scrapy.cfg

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[settings]
2+
default = scrapy_sample.settings
3+
4+
[deploy]
5+
#url = http://localhost:6800/
6+
project = scrapy_sample

scrapy_sample/scrapy_sample/__init__.py

Whitespace-only changes.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Define here the models for your scraped items
4+
#
5+
# See documentation in:
6+
# http://doc.scrapy.org/en/latest/topics/items.html
7+
8+
import scrapy
9+
10+
11+
class ScrapySampleItem(scrapy.Item):
    """Container for data scraped by the sample spiders.

    Declare one ``scrapy.Field()`` class attribute per piece of data
    you want to extract, e.g.::

        name = scrapy.Field()

    See http://doc.scrapy.org/en/latest/topics/items.html
    """

    pass
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Define here the models for your spider middleware
4+
#
5+
# See documentation in:
6+
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7+
8+
from scrapy import signals
9+
10+
11+
class ScrapySampleSpiderMiddleware(object):
    """Spider middleware template for the scrapy_sample project.

    Not all methods need to be defined: if a method is missing, Scrapy
    acts as if this middleware does not modify the passed objects.

    See http://doc.scrapy.org/en/latest/topics/spider-middleware.html

    NOTE(fix): the original template omitted ``self`` on the four
    ``process_*`` hooks below.  Scrapy invokes them as instance methods,
    so without ``self`` the arguments mis-bind (the response would be
    bound as ``self``) and the middleware crashes as soon as it is
    enabled.  ``self`` has been restored; behavior is otherwise the
    stock template behavior.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        #
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        #
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        #
        # Should return either None or an iterable of Response, dict
        # or Item objects.  Returning None (implicitly) lets other
        # middlewares handle the exception.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        #
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        # Log when the spider starts; connected via from_crawler().
        spider.logger.info('Spider opened: %s' % spider.name)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Define your item pipelines here
4+
#
5+
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
6+
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7+
8+
9+
class ScrapySamplePipeline(object):
    """Item pipeline stub: forwards every item unchanged.

    Remember to register this class in the ITEM_PIPELINES setting.
    See http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    """

    def process_item(self, item, spider):
        """Return *item* unmodified so later pipeline stages receive it."""
        return item
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# -*- coding: utf-8 -*-
"""Scrapy settings for the scrapy_sample project.

For simplicity only commonly used settings appear here.  Full reference:

    http://doc.scrapy.org/en/latest/topics/settings.html
    http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
"""

BOT_NAME = 'scrapy_sample'

SPIDER_MODULES = ['scrapy_sample.spiders']
NEWSPIDER_MODULE = 'scrapy_sample.spiders'

# Emit feed exports as UTF-8 so non-ASCII (e.g. Chinese) text stays readable.
FEED_EXPORT_ENCODING = 'utf-8'

# Obey robots.txt rules.
ROBOTSTXT_OBEY = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'scrapy_sample (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0).
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# and the autothrottle settings below.  The download delay setting will
# honor only one of CONCURRENT_REQUESTS_PER_DOMAIN / CONCURRENT_REQUESTS_PER_IP.
# DOWNLOAD_DELAY = 3
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'scrapy_sample.middlewares.ScrapySampleSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy_sample.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'scrapy_sample.pipelines.ScrapySamplePipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_START_DELAY = 5          # initial download delay
# AUTOTHROTTLE_MAX_DELAY = 60           # max delay under high latencies
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # avg parallel requests per remote server
# AUTOTHROTTLE_DEBUG = False            # show throttling stats per response

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

0 commit comments

Comments
 (0)