diff --git a/.idea/misc.xml b/.idea/misc.xml index d1e22ec..6b7f31a 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/.idea/spider.iml b/.idea/spider.iml index 0e4e9fa..102589c 100644 --- a/.idea/spider.iml +++ b/.idea/spider.iml @@ -4,7 +4,7 @@ - + \ No newline at end of file diff --git a/__pycache__/request.cpython-39.pyc b/__pycache__/request.cpython-39.pyc index 099c621..db6f30c 100644 Binary files a/__pycache__/request.cpython-39.pyc and b/__pycache__/request.cpython-39.pyc differ diff --git a/__pycache__/tools.cpython-39.pyc b/__pycache__/tools.cpython-39.pyc index 0caa215..7894cec 100644 Binary files a/__pycache__/tools.cpython-39.pyc and b/__pycache__/tools.cpython-39.pyc differ diff --git a/data/filmData.xls b/data/filmData.xls deleted file mode 100644 index e918787..0000000 Binary files a/data/filmData.xls and /dev/null differ diff --git "a/data/\350\256\260\345\275\225\347\210\254\350\231\253\346\227\266\351\227\264.txt" "b/data/\350\256\260\345\275\225\347\210\254\350\231\253\346\227\266\351\227\264.txt" deleted file mode 100644 index 19a33d5..0000000 --- "a/data/\350\256\260\345\275\225\347\210\254\350\231\253\346\227\266\351\227\264.txt" +++ /dev/null @@ -1 +0,0 @@ -12/11 18:47 \ No newline at end of file diff --git a/main.py b/main.py index 85de0d3..c6421fc 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,16 @@ import request -import time -req = request.Request() -status = req.get(url="https://movie.douban.com/top250", start=25) +import tools as t +req = request.Request() # 实例化一个request模块中的Request类对象 + +page = t.Utils.choosePage(page_number=2) # page_number:参数取值范围1~10, 默认为第一页 + +# 通过Request对象req调用其中自定义的get()函数,目的是向豆瓣网服务器发送请求,会产生一个boolean类型的返回值,用于下面判断使用 +status = req.get(url="https://movie.douban.com/top250", page=page) + +# 设定数据保存路径,就是将网页爬取的数据保存至那个地方的放个excel数据表格中 save_path = ".\\data\\filmData.xls" if status: - req.do(path=save_path) + req.do(path=save_path) # 执行Request类中的do()函数 else: print("出错!") diff --git a/request.py b/request.py index bbf6f52..91d776d 100644 --- a/request.py +++ b/request.py @@ -4,17 +4,18 @@ import xlrd import time + class Request: # 类成员html html = '' - def get(self, url, start): + def get(self, url, page): # 模拟http头文件信息 head = { "USER-Agent": 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome ' '/ 80.0.3987.122 Safari / 537.36 ' } - url = url + '?start=' + str(start) # 组装url地址 + url = url + '?start=' + str(page) # 组装url地址 html = requests.get(url, timeout=30, headers=head) html.encoding = 'utf-8' if html.status_code != 200: @@ -25,10 +26,10 @@ def get(self, url, start): def do(self, path): obj = t.DataResolve(html=self.html) # 实例化DataResolve对象 - print("*"*20, "解析执行开始", "*"*20) + print("*" * 20, "解析执行开始", "*" * 20) result = obj.resolve() # 执行解析 time.sleep(0.1) - print("*"*20, "解析结束", "*"*20) + print("*" * 20, result, "*" * 20) workbook = xlrd.open_workbook(path) sheets_name = workbook.sheet_names() sheet = workbook.sheet_by_name(sheet_name=sheets_name[0]) @@ -46,5 +47,7 @@ def do(self, path): time.sleep(0.1) print("*" * 20, "数据写入结束", "*" * 20) dv = t.DataVisualization(excel=".\\data\\filmData.xls") + print(dv.Scatter2()) + print(dv.treeMap()) + print(dv.Scatter1()) print(dv.wordCloud()) - diff --git a/tools.py b/tools.py index f9a5079..9c0e6ac 100644 --- a/tools.py +++ b/tools.py @@ -8,7 +8,7 @@ from xlutils.copy import copy import time -from pyecharts.charts import Bar, WordCloud +from pyecharts.charts import WordCloud from pyecharts import options as opts from pyecharts.charts import TreeMap from pyecharts.globals import SymbolType @@ -27,16 +27,31 @@ class DataResolve: def __init__(self, html): self.html = html + try: + xlrd.open_workbook(".\\data\\filmData.xls") + except: + # 初始化本地存储excel表格 + print("filmData.xls文件不存在,创建filmData.xls文件") + workbook = xlwt.Workbook() # 创建工作簿对象 + workbook.add_sheet("sheet1") # 创建工作表对象 + workbook.save(".\\data\\filmData.xls") + try: + open(".\\data\\logs.txt") + except: + print("创建logs.txt日志文件") + open(".\\data\\logs.txt", 'w') # 初始化日志文件 + ''' 提供外部访问方法 ''' def resolve(self): - data = self.__resolvePicture() + Utils.write_log() # 日志记录 + picture = self.__resolvePicture() score = self.__resolveScore() film = self.__resolveEvaluations() # self.__resolveFilmDownLoad() - return '' + return '解析结束' ''' 解析图片 @@ -80,6 +95,7 @@ def __resolveScore(self): print("影片评分解析成功") return self.filmScore + ''' 解析评价数 ''' @@ -152,7 +168,7 @@ def saveData(self, path): 将数据写入Excel中 ''' - def writeExcel(self, sheet_name, save_path): + def writeExcel(self, sheet_name, save_path='.\\data\\filmData.xls'): workbook = xlwt.Workbook(encoding='ascii') # 创建工作簿对象 sheet = workbook.add_sheet(sheet_name) # 创建工作表对象 @@ -201,7 +217,7 @@ def writeExcelAppend(self, path): sheets_name = workbook.sheet_names() sheet = workbook.sheet_by_name(sheets_name[0]) rows = sheet.nrows - new_workbook = copy(workbook) + new_workbook = copy(workbook) # xlrd对象转为xlwt对象 new_sheet = new_workbook.get_sheet(0) dataStyle = xlwt.XFStyle() # 创建数据样式对象 dataAlign = xlwt.Alignment() @@ -231,11 +247,13 @@ def writeExcelAppend(self, path): class Utils: - workbook = xlrd.open_workbook(".\\data\\filmData.xls") - sheet = workbook.sheets()[0] names = [] comments = [] - rows = int(sheet.nrows) + + def __init__(self, excel_path): + workbook = xlrd.open_workbook(excel_path) + self.sheet = workbook.sheets()[0] + self.rows = int(self.sheet.nrows) def chooseData(self): # 随机从excel表中选出4个名称 @@ -245,6 +263,50 @@ def chooseData(self): self.comments.append(self.sheet.cell_value(flag, 4)) return self.names, self.comments + @staticmethod + def choosePage(page_number=1): + print("爬取第" + str(page_number) + "页") + start = 0 # 实际起始位置 + if page_number == 1: # 第一页 + start = 0 + return start + elif page_number == 2: # 第二页 + start = 25 + return start + elif page_number == 3: # 第三页 + start = 50 + return start + elif page_number == 4: # 第四页 + start = 75 + return start + elif page_number == 5: # 第五页 + start = 100 + return start + elif page_number == 6: # 第六页 + start = 125 + return start + elif page_number == 7: # 第七页 + start = 150 + return start + elif page_number == 8: # 第八页 + start = 175 + return start + elif page_number == 9: # 第九页 + start = 200 + return start + elif page_number == 10: # 第十页 + start = 225 + return start + else: + raise '页码不合法' + + @staticmethod + def write_log(): + log = open(".\\data\\logs.txt", 'a', encoding='utf-8') + currentTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + log.write('\n' + str(currentTime) + "爬取数据!") + log.close() + ''' 生成可视化文件 @@ -266,7 +328,7 @@ def treeMap(self): workbook = xlrd.open_workbook(self.path) sheet = workbook.sheets()[0] rows = sheet.nrows - flag = 0 + flag = 0 # excel表有效数据标识符 firstPage = [] secondPage = [] thirdPage = [] @@ -391,15 +453,15 @@ def Scatter1(self): ) .render(".\\data\\scatter1.html") ) + return '散点图1生成成功' ''' 散点图2 ''' def Scatter2(self): - u = Utils - data = u.chooseData(u) - print(data) + u = Utils(excel_path='.\\data\\filmData.xls') + data = u.chooseData() c = ( Scatter() .add_xaxis(u.names) @@ -410,6 +472,4 @@ def Scatter2(self): ) .render(".\\data\\scatter2.html") ) - - - + return '散点图2生成成功'