知乎爬虫-指定问题采集全回答发布时间:2020/4/5 10:08:18 阅读次数:

  

知乎爬虫-指定问题采集全回答

知乎回答提取程序

只需运行主程序 html.py 即可。提取出的书籍信息保存在 read2.txt 中;保存网页源代码的功能已在源代码中注释掉。

from selenium import webdriver
import requests
import re
from lxml import etree
from selenium.webdriver import ChromeOptions  # 需要导入的类
from selenium.webdriver.common.keys import Keys
import time
import tiqu

# Dump a page's raw HTML source to disk
def write2(html, name):
    """Write the string *html* to ./data/<name>.txt (UTF-8, overwriting).

    Args:
        html: page source to save.
        name: base file name (without extension); the ./data directory
              must already exist.
    """
    path = "./data/" + name + '.txt'
    # The with-statement closes the file automatically; the original
    # explicit file.close() inside the block was redundant.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(html)

def socket_get(driver_path='C:/Users/31114/PycharmProjects/chromedriver.exe'):
    """Create a Chrome WebDriver configured for crawling.

    Args:
        driver_path: path to the chromedriver executable.  Defaults to the
            original hard-coded location so existing callers keep working.

    Returns:
        A selenium Chrome WebDriver instance.
    """
    option = ChromeOptions()
    # Hide the "controlled by automated software" infobar so the site is
    # less likely to detect the crawler.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Optional proxy:
    # option.add_argument('--proxy-server=http://127.0.0.1:9090')
    socket = webdriver.Chrome(driver_path, options=option)
    return socket
def sele(url, name, dict):
    """Open *url*, scroll until all answers are rendered, then extract titles.

    Args:
        url: Zhihu question URL to crawl.
        name: question id, used as the file/record name.
        dict: running {book title: count} mapping passed through to tiqu.find.
    """
    driver = socket_get()
    driver.get(url)
    # These button labels only show up a few times once the full answer
    # list has finished loading -- they act as the stop condition.
    tingzhi = "写回答</button>"
    tingzhi2 = "写第一个回答</button>"
    last_html = ""
    # Brute-force scrolling: repeatedly jump to the bottom of the page
    # until the stop markers appear often enough.
    for _ in range(1, 1000):
        driver.execute_script("var q=document.documentElement.scrollTop=1000000")
        page = driver.page_source
        # write2(page, name)  # optionally dump the raw HTML to name.txt
        marker_count = len(re.findall(tingzhi, page)) + len(re.findall(tingzhi2, page))
        if marker_count >= 3:
            last_html = page
            break
        time.sleep(1)
    # write2(last_html, name)  # uncomment (and comment the block below) to
    # save the source instead of extracting in place
    if last_html != "":
        tiqu.find(last_html, name + '.txt', dict)
    print(url + "已提取完毕")
    time.sleep(3)


if __name__ == "__main__":
    url2 = "https://www.zhihu.com/question/345473425"
    #url ="https://www.zhihu.com/question/374501668"
    old_vis = tiqu.get_old_vis()   # names of pages already processed
    dict = tiqu.get_old_books()    # previously extracted {title: count} data
    # The question id (5-11 digits) in the URL doubles as the page name.
    html_name = re.findall('([0-9]{5,11})', url2)[0]
    print(html_name)
    if html_name + '.txt' + '\n' not in old_vis:
        sele(url=url2, name=html_name, dict=dict)

下面的代码请保存为 tiqu.py:

import re
import os


# Load a saved page source and extract book titles from it
def read(html, dict, old_vis):
    """Process one saved page file from ./data.

    Args:
        html: file name of the saved page source (e.g. "12345.txt").
        dict: running {book title: count} mapping, updated in place.
        old_vis: list of already-processed file names (each ends with "\n").
    """
    # Skip pages that were already handled in a previous run.
    if html + '\n' in old_vis:
        return
    # read() returns the whole file at once -- the original concatenated
    # readlines() output in a loop, which was quadratic and shadowed `str`.
    with open('./data/' + html, 'r', encoding='utf-8') as file:
        content = file.read()
    find(content, html, dict)


def find(strr, html, dict):
    """Extract 《...》 book titles from page source *strr* and record them.

    Marks the page as visited only when at least one title was found, so
    pages that failed to load can be retried later.

    Args:
        strr: full page HTML source.
        html: page/file name, recorded in vis.txt on success.
        dict: running {book title: count} mapping, updated in place.
    """
    books = re.findall('《(.*?)》', strr)
    if books:  # truthiness check replaces the `!= []` comparison
        tongji(books, dict)
        write_vis(html)


def tongji(books, dict):
    """Accumulate *books* into the title counter and persist the result.

    Args:
        books: list of raw titles (may still contain HTML tags).
        dict: running {book title: count} mapping, updated in place.
    """
    for book in books:
        book = re.sub('<.*?>', "", book)  # strip stray HTML tags inside titles
        # dict.get replaces the original if/else membership test.
        dict[book] = dict.get(book, 0) + 1
    # Sort by count, most frequent first, and rewrite read2.txt.
    aps = sorted(dict.items(), key=lambda d: d[1], reverse=True)
    write(aps)


def write(aps):
    """Write sorted (title, count) pairs to ./read2.txt, one per line.

    Args:
        aps: list of (title, count) tuples, already sorted by count.
    """
    print(aps)
    lend = len(aps)
    print("当前有" + str(lend) + "本书")
    # The with-statement closes the file; the original explicit
    # file.close() inside the block was redundant.
    with open('./read2.txt', 'w', encoding='utf-8') as file:
        for title, count in aps:
            file.write(title + "\t" + str(count) + '\n')


# List every saved page-source file name under ./data
def all_html():
    """Return the names of all files in the ./data directory."""
    return os.listdir('./data')


# Load the {title: count} data extracted by previous runs
def get_old_books():
    """Read previously extracted book counts back from ./read2.txt.

    Returns:
        dict mapping book title to occurrence count.  Empty on the first
        run (file missing) and tolerant of malformed lines.
    """
    old_dic_books = {}
    try:
        with open('./read2.txt', 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        return old_dic_books  # first run: nothing extracted yet
    for book in lines:
        parts = book.split('\t')
        try:
            old_dic_books[parts[0]] = int(parts[1].replace('\n', ''))
        except Exception as e:
            # Malformed line (missing tab / non-numeric count): report and
            # keep going rather than aborting the whole load.
            print(e)
    return old_dic_books


def write_vis(html):
    """Append page name *html* to ./vis.txt.

    Passing an empty string just ensures the file exists (append mode
    creates it), which get_old_vis relies on before reading; the
    original's explicit write('') in that case was a no-op.

    Args:
        html: page/file name to record, or "" to only touch the file.
    """
    with open('./vis.txt', 'a', encoding='utf-8') as file:
        if html != "":
            file.write(html + '\n')


# Load the names of pages processed by previous runs
def get_old_vis():
    """Return the list of already-processed page names from ./vis.txt.

    Each entry keeps its trailing newline, matching how callers test
    membership with `name + '\\n' in old_vis`.
    """
    write_vis("")  # ensure ./vis.txt exists before reading it
    with open('./vis.txt', 'r', encoding='utf-8') as file:
        return file.readlines()


if __name__ == "__main__":
    old_vis = get_old_vis()    # pages already processed in earlier runs
    dict = get_old_books()     # previously extracted {title: count} data
    # Process every saved page source under ./data.
    for html in all_html():
        read(html, dict, old_vis)

 14tfe6ugws.png

运行即可提取其中是书名的部分