diff --git a/.idea/misc.xml b/.idea/misc.xml
index d1e22ec..6b7f31a 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/.idea/spider.iml b/.idea/spider.iml
index 0e4e9fa..102589c 100644
--- a/.idea/spider.iml
+++ b/.idea/spider.iml
@@ -4,7 +4,7 @@
-
+
\ No newline at end of file
diff --git a/__pycache__/request.cpython-39.pyc b/__pycache__/request.cpython-39.pyc
index 099c621..db6f30c 100644
Binary files a/__pycache__/request.cpython-39.pyc and b/__pycache__/request.cpython-39.pyc differ
diff --git a/__pycache__/tools.cpython-39.pyc b/__pycache__/tools.cpython-39.pyc
index 0caa215..7894cec 100644
Binary files a/__pycache__/tools.cpython-39.pyc and b/__pycache__/tools.cpython-39.pyc differ
diff --git a/data/filmData.xls b/data/filmData.xls
deleted file mode 100644
index e918787..0000000
Binary files a/data/filmData.xls and /dev/null differ
diff --git "a/data/\350\256\260\345\275\225\347\210\254\350\231\253\346\227\266\351\227\264.txt" "b/data/\350\256\260\345\275\225\347\210\254\350\231\253\346\227\266\351\227\264.txt"
deleted file mode 100644
index 19a33d5..0000000
--- "a/data/\350\256\260\345\275\225\347\210\254\350\231\253\346\227\266\351\227\264.txt"
+++ /dev/null
@@ -1 +0,0 @@
-12/11 18:47
\ No newline at end of file
diff --git a/main.py b/main.py
index 85de0d3..c6421fc 100644
--- a/main.py
+++ b/main.py
@@ -1,10 +1,16 @@
import request
-import time
-req = request.Request()
-status = req.get(url="https://movie.douban.com/top250", start=25)
+import tools as t
+req = request.Request() # instantiate the Request class from the request module
+
+page = t.Utils.choosePage(page_number=2) # page_number: valid range is 1-10, defaults to the first page
+
+# Call the custom get() on req to send the request to the Douban server; its return value is used as the success flag checked below
+status = req.get(url="https://movie.douban.com/top250", page=page)
+
+# Path of the Excel workbook that the scraped data is saved to
save_path = ".\\data\\filmData.xls"
if status:
- req.do(path=save_path)
+ req.do(path=save_path) # run Request.do() to parse the page and store the results
else:
print("出错!")
diff --git a/request.py b/request.py
index bbf6f52..91d776d 100644
--- a/request.py
+++ b/request.py
@@ -4,17 +4,18 @@
import xlrd
import time
+
class Request:
# 类成员html
html = ''
- def get(self, url, start):
+ def get(self, url, page):
# 模拟http头文件信息
head = {
"USER-Agent": 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome '
'/ 80.0.3987.122 Safari / 537.36 '
}
- url = url + '?start=' + str(start) # 组装url地址
+ url = url + '?start=' + str(page) # 组装url地址
html = requests.get(url, timeout=30, headers=head)
html.encoding = 'utf-8'
if html.status_code != 200:
@@ -25,10 +26,10 @@ def get(self, url, start):
def do(self, path):
obj = t.DataResolve(html=self.html) # 实例化DataResolve对象
- print("*"*20, "解析执行开始", "*"*20)
+ print("*" * 20, "解析执行开始", "*" * 20)
result = obj.resolve() # 执行解析
time.sleep(0.1)
- print("*"*20, "解析结束", "*"*20)
+ print("*" * 20, result, "*" * 20)
workbook = xlrd.open_workbook(path)
sheets_name = workbook.sheet_names()
sheet = workbook.sheet_by_name(sheet_name=sheets_name[0])
@@ -46,5 +47,7 @@ def do(self, path):
time.sleep(0.1)
print("*" * 20, "数据写入结束", "*" * 20)
dv = t.DataVisualization(excel=".\\data\\filmData.xls")
+ print(dv.Scatter2())
+ print(dv.treeMap())
+ print(dv.Scatter1())
print(dv.wordCloud())
-
diff --git a/tools.py b/tools.py
index f9a5079..9c0e6ac 100644
--- a/tools.py
+++ b/tools.py
@@ -8,7 +8,7 @@
from xlutils.copy import copy
import time
-from pyecharts.charts import Bar, WordCloud
+from pyecharts.charts import WordCloud
from pyecharts import options as opts
from pyecharts.charts import TreeMap
from pyecharts.globals import SymbolType
@@ -27,16 +27,31 @@ class DataResolve:
def __init__(self, html):
self.html = html
+        # Make sure the workbook that later steps read and append to exists.
+        try:
+            xlrd.open_workbook(".\\data\\filmData.xls")
+        except Exception:
+            # Best effort, as before: any failure to open the workbook
+            # (missing or unreadable file) recreates it with one empty sheet.
+            # Catching Exception rather than using a bare except keeps
+            # KeyboardInterrupt/SystemExit from being swallowed here.
+            print("filmData.xls文件不存在,创建filmData.xls文件")
+            workbook = xlwt.Workbook()  # new workbook object
+            workbook.add_sheet("sheet1")  # xlwt needs at least one sheet to save
+            workbook.save(".\\data\\filmData.xls")
+        # Make sure the log file exists. The probe handle is closed right
+        # away instead of being left for the garbage collector.
+        try:
+            with open(".\\data\\logs.txt"):
+                pass
+        except OSError:
+            print("创建logs.txt日志文件")
+            with open(".\\data\\logs.txt", 'w'):
+                pass  # create an empty log file
+
'''
提供外部访问方法
'''
def resolve(self):
- data = self.__resolvePicture()
+ Utils.write_log() # append a timestamped entry to the crawl log
+ # Run each parser in turn; the values they return are not used further in this method.
+ picture = self.__resolvePicture()
score = self.__resolveScore()
film = self.__resolveEvaluations()
# self.__resolveFilmDownLoad()
- return ''
+ # Fixed status text handed back to the caller once all parsers have run.
+ return '解析结束'
'''
解析图片
@@ -80,6 +95,7 @@ def __resolveScore(self):
print("影片评分解析成功")
return self.filmScore
+
'''
解析评价数
'''
@@ -152,7 +168,7 @@ def saveData(self, path):
将数据写入Excel中
'''
- def writeExcel(self, sheet_name, save_path):
+ def writeExcel(self, sheet_name, save_path='.\\data\\filmData.xls'):
workbook = xlwt.Workbook(encoding='ascii') # 创建工作簿对象
sheet = workbook.add_sheet(sheet_name) # 创建工作表对象
@@ -201,7 +217,7 @@ def writeExcelAppend(self, path):
sheets_name = workbook.sheet_names()
sheet = workbook.sheet_by_name(sheets_name[0])
rows = sheet.nrows
- new_workbook = copy(workbook)
+ new_workbook = copy(workbook) # xlrd对象转为xlwt对象
new_sheet = new_workbook.get_sheet(0)
dataStyle = xlwt.XFStyle() # 创建数据样式对象
dataAlign = xlwt.Alignment()
@@ -231,11 +247,13 @@ def writeExcelAppend(self, path):
class Utils:
- workbook = xlrd.open_workbook(".\\data\\filmData.xls")
- sheet = workbook.sheets()[0]
names = []
comments = []
- rows = int(sheet.nrows)
+
+    def __init__(self, excel_path):
+        """Open the workbook at *excel_path* and keep its first sheet.
+
+        :param excel_path: path of the ``.xls`` workbook to read.
+        """
+        workbook = xlrd.open_workbook(excel_path)
+        self.sheet = workbook.sheets()[0]
+        self.rows = int(self.sheet.nrows)
+        # chooseData() appends to self.comments and returns self.names; give
+        # every instance its own lists so results do not accumulate in the
+        # class-level lists shared by all Utils objects.
+        self.names = []
+        self.comments = []
def chooseData(self):
# 随机从excel表中选出4个名称
@@ -245,6 +263,50 @@ def chooseData(self):
self.comments.append(self.sheet.cell_value(flag, 4))
return self.names, self.comments
+    @staticmethod
+    def choosePage(page_number=1):
+        """Translate a 1-based page number into the ``start`` offset.
+
+        Offsets advance in steps of 25: page 1 maps to 0, page 2 to 25,
+        and so on up to page 10, which maps to 225.
+
+        :param page_number: page to crawl, 1 through 10 (defaults to 1).
+        :return: integer offset for the requested page.
+        :raises ValueError: if ``page_number`` is not one of 1..10.
+        """
+        print("爬取第" + str(page_number) + "页")
+        # The old code raised a bare string, which is itself a TypeError on
+        # Python 3; report invalid pages with a real exception type instead.
+        if page_number not in range(1, 11):
+            raise ValueError('页码不合法')
+        return (int(page_number) - 1) * 25
+
+    @staticmethod
+    def write_log():
+        """Append one timestamped crawl entry to ``.\\data\\logs.txt``.
+
+        The entry is a newline followed by the local time formatted as
+        ``YYYY-MM-DD HH:MM:SS`` and a fixed marker text.
+        """
+        currentTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
+        # The context manager closes the file even if the write fails.
+        with open(".\\data\\logs.txt", 'a', encoding='utf-8') as log:
+            log.write('\n' + currentTime + "爬取数据!")
+
'''
生成可视化文件
@@ -266,7 +328,7 @@ def treeMap(self):
workbook = xlrd.open_workbook(self.path)
sheet = workbook.sheets()[0]
rows = sheet.nrows
- flag = 0
+ flag = 0 # excel表有效数据标识符
firstPage = []
secondPage = []
thirdPage = []
@@ -391,15 +453,15 @@ def Scatter1(self):
)
.render(".\\data\\scatter1.html")
)
+ return '散点图1生成成功'
'''
散点图2
'''
def Scatter2(self):
- u = Utils
- data = u.chooseData(u)
- print(data)
+ u = Utils(excel_path='.\\data\\filmData.xls')
+ data = u.chooseData()
c = (
Scatter()
.add_xaxis(u.names)
@@ -410,6 +472,4 @@ def Scatter2(self):
)
.render(".\\data\\scatter2.html")
)
-
-
-
+ return '散点图2生成成功'