知乎爬虫-指定问题采集全回答发布时间:2020/4/5 10:08:18 阅读次数:

  

知乎爬虫-指定问题采集全回答

知乎回答提取程序

只需运行主程序 html.py 即可。提取出的书籍信息保存在 read2.txt 中;保存网页源代码的功能已在源代码中注释掉。

from selenium import webdriver
import requests
import re
from lxml import etree
from selenium.webdriver import ChromeOptions  # 需要导入的类
from selenium.webdriver.common.keys import Keys
import time
import tiqu

# Dump a page's raw HTML source to disk
def write2(html, name):
    """Write the string *html* to ./data/<name>.txt (UTF-8, overwriting).

    Args:
        html: page source to save.
        name: base file name (without extension); the ./data directory
              must already exist.
    """
    path = "./data/" + name + '.txt'
    # The with-statement closes the file automatically; the original
    # explicit file.close() inside the block was redundant.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(html)

def socket_get(driver_path='C:/Users/31114/PycharmProjects/chromedriver.exe'):
    """Create a Chrome WebDriver configured for crawling.

    Args:
        driver_path: path to the chromedriver executable.  Defaults to the
            original hard-coded location so existing callers keep working.

    Returns:
        A selenium Chrome WebDriver instance.
    """
    option = ChromeOptions()
    # Hide the "controlled by automated software" infobar so the site is
    # less likely to detect the crawler.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Optional proxy:
    # option.add_argument('--proxy-server=http://127.0.0.1:9090')
    socket = webdriver.Chrome(driver_path, options=option)
    return socket
def sele(url, name, dict):
    """Open *url*, scroll until all answers are rendered, then extract titles.

    Args:
        url: Zhihu question URL to crawl.
        name: question id, used as the file/record name.
        dict: running {book title: count} mapping passed through to tiqu.find.
    """
    driver = socket_get()
    driver.get(url)
    # These button labels only show up a few times once the full answer
    # list has finished loading -- they act as the stop condition.
    tingzhi = "写回答</button>"
    tingzhi2 = "写第一个回答</button>"
    last_html = ""
    # Brute-force scrolling: repeatedly jump to the bottom of the page
    # until the stop markers appear often enough.
    for _ in range(1, 1000):
        driver.execute_script("var q=document.documentElement.scrollTop=1000000")
        page = driver.page_source
        # write2(page, name)  # optionally dump the raw HTML to name.txt
        marker_count = len(re.findall(tingzhi, page)) + len(re.findall(tingzhi2, page))
        if marker_count >= 3:
            last_html = page
            break
        time.sleep(1)
    # write2(last_html, name)  # uncomment (and comment the block below) to
    # save the source instead of extracting in place
    if last_html != "":
        tiqu.find(last_html, name + '.txt', dict)
    print(url + "已提取完毕")
    time.sleep(3)


if __name__ == "__main__":
    url2 = "https://www.zhihu.com/question/345473425"
    #url ="https://www.zhihu.com/question/374501668"
    old_vis = tiqu.get_old_vis()   # names of pages already processed
    dict = tiqu.get_old_books()    # previously extracted {title: count} data
    # The question id (5-11 digits) in the URL doubles as the page name.
    html_name = re.findall('([0-9]{5,11})', url2)[0]
    print(html_name)
    if html_name + '.txt' + '\n' not in old_vis:
        sele(url=url2, name=html_name, dict=dict)

下面的代码请保存为 tiqu.py:

import re
import os


# Load a saved page source and extract book titles from it
def read(html, dict, old_vis):
    """Process one saved page file from ./data.

    Args:
        html: file name of the saved page source (e.g. "12345.txt").
        dict: running {book title: count} mapping, updated in place.
        old_vis: list of already-processed file names (each ends with "\n").
    """
    # Skip pages that were already handled in a previous run.
    if html + '\n' in old_vis:
        return
    # read() returns the whole file at once -- the original concatenated
    # readlines() output in a loop, which was quadratic and shadowed `str`.
    with open('./data/' + html, 'r', encoding='utf-8') as file:
        content = file.read()
    find(content, html, dict)


def find(strr, html, dict):
    """Extract 《...》 book titles from page source *strr* and record them.

    Marks the page as visited only when at least one title was found, so
    pages that failed to load can be retried later.

    Args:
        strr: full page HTML source.
        html: page/file name, recorded in vis.txt on success.
        dict: running {book title: count} mapping, updated in place.
    """
    books = re.findall('《(.*?)》', strr)
    if books:  # truthiness check replaces the `!= []` comparison
        tongji(books, dict)
        write_vis(html)


def tongji(books, dict):
    """Accumulate *books* into the title counter and persist the result.

    Args:
        books: list of raw titles (may still contain HTML tags).
        dict: running {book title: count} mapping, updated in place.
    """
    for book in books:
        book = re.sub('<.*?>', "", book)  # strip stray HTML tags inside titles
        # dict.get replaces the original if/else membership test.
        dict[book] = dict.get(book, 0) + 1
    # Sort by count, most frequent first, and rewrite read2.txt.
    aps = sorted(dict.items(), key=lambda d: d[1], reverse=True)
    write(aps)


def write(aps):
    """Write sorted (title, count) pairs to ./read2.txt, one per line.

    Args:
        aps: list of (title, count) tuples, already sorted by count.
    """
    print(aps)
    lend = len(aps)
    print("当前有" + str(lend) + "本书")
    # The with-statement closes the file; the original explicit
    # file.close() inside the block was redundant.
    with open('./read2.txt', 'w', encoding='utf-8') as file:
        for title, count in aps:
            file.write(title + "\t" + str(count) + '\n')


# List every saved page-source file name under ./data
def all_html():
    """Return the names of all files in the ./data directory."""
    return os.listdir('./data')


# Load the {title: count} data extracted by previous runs
def get_old_books():
    """Read previously extracted book counts back from ./read2.txt.

    Returns:
        dict mapping book title to occurrence count.  Empty on the first
        run (file missing) and tolerant of malformed lines.
    """
    old_dic_books = {}
    try:
        with open('./read2.txt', 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        return old_dic_books  # first run: nothing extracted yet
    for book in lines:
        parts = book.split('\t')
        try:
            old_dic_books[parts[0]] = int(parts[1].replace('\n', ''))
        except Exception as e:
            # Malformed line (missing tab / non-numeric count): report and
            # keep going rather than aborting the whole load.
            print(e)
    return old_dic_books


def write_vis(html):
    """Append page name *html* to ./vis.txt.

    Passing an empty string just ensures the file exists (append mode
    creates it), which get_old_vis relies on before reading; the
    original's explicit write('') in that case was a no-op.

    Args:
        html: page/file name to record, or "" to only touch the file.
    """
    with open('./vis.txt', 'a', encoding='utf-8') as file:
        if html != "":
            file.write(html + '\n')


# Load the names of pages processed by previous runs
def get_old_vis():
    """Return the list of already-processed page names from ./vis.txt.

    Each entry keeps its trailing newline, matching how callers test
    membership with `name + '\\n' in old_vis`.
    """
    write_vis("")  # ensure ./vis.txt exists before reading it
    with open('./vis.txt', 'r', encoding='utf-8') as file:
        return file.readlines()


if __name__ == "__main__":
    old_vis = get_old_vis()    # pages already processed in earlier runs
    dict = get_old_books()     # previously extracted {title: count} data
    # Process every saved page source under ./data.
    for html in all_html():
        read(html, dict, old_vis)

 14tfe6ugws.png

运行即可提取其中是书名的部分