# 知乎回答提取程序:只需运行主程序 html.py 即可。书籍信息保存在 read2.txt 中;
# 源代码保存已注释。下面代码保存名字为 tiqu.py。
from selenium import webdriver
import requests
import re
from lxml import etree
from selenium.webdriver import ChromeOptions # 需要导入的类
from selenium.webdriver.common.keys import Keys
import time
import tiqu
# 记录网页源代码
def write2(html, name):
    """Save raw page HTML to ./data/<name>.txt (UTF-8), overwriting any
    existing file.

    Args:
        html: the page source string to persist.
        name: base filename (the question id); '.txt' is appended.
    """
    path = "./data/" + name + '.txt'
    # The 'with' statement closes the file automatically; the explicit
    # file.close() the original called inside the block was redundant.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(html)
def socket_get():
    """Create and return a Chrome WebDriver with the automation banner hidden."""
    opts = ChromeOptions()
    # Drop the "Chrome is being controlled by automated software" switch so
    # the page session looks like a normal browser.
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    # 代理 (optional proxy, kept disabled):
    # opts.add_argument('--proxy-server=http://127.0.0.1:9090')
    return webdriver.Chrome('C:/Users/31114/PycharmProjects/chromedriver.exe',
                            options=opts)
def sele(url, name, dict):
    """Open *url* in Chrome, scroll until all answers appear loaded, then hand
    the final page source to tiqu.find for book-title extraction.

    Args:
        url:  Zhihu question URL.
        name: numeric question id, used as the record key (name + '.txt').
        dict: running {book title: count} mapping, updated by tiqu.find.
    """
    driver = socket_get()
    driver.get(url)
    # Markers that indicate the answer list has been rendered to the end.
    tingzhi = "写回答</button>"
    tingzhi2 = "写第一个回答</button>"
    scroll_js = "var q=document.documentElement.scrollTop=1000000"
    last_html = ""
    # Brute-force scroll loop: jump to the bottom repeatedly until the
    # end-of-page markers show up often enough (or we give up after ~1000 tries).
    for _ in range(1, 1000):
        driver.execute_script(scroll_js)
        html = driver.page_source
        hits = len(re.findall(tingzhi, html)) + len(re.findall(tingzhi2, html))
        if hits >= 3:
            last_html = html
            break
        time.sleep(1)
    # To persist the page source, re-enable write2(last_html, name) here.
    if last_html != "":
        tiqu.find(last_html, name + '.txt', dict)
        print(url + "已提取完毕")
        time.sleep(3)
if __name__ == "__main__":
    url2 = "https://www.zhihu.com/question/345473425"
    # url = "https://www.zhihu.com/question/374501668"
    old_vis = tiqu.get_old_vis()    # names of pages already scraped
    dict = tiqu.get_old_books()     # previously extracted book counts
    # The numeric question id embedded in the URL becomes the record name.
    html_name = re.findall('([0-9]{5,11})', url2)
    print(html_name[0])
    html_name = html_name[0]
    # Skip questions already recorded in vis.txt.
    if html_name + '.txt' + '\n' not in old_vis:
        sele(url=url2, name=html_name, dict=dict)

import re
import os
# 读取网页源码
def read(html, dict, old_vis):
    """Load the saved page source *html* from ./data and extract book titles
    into *dict*, skipping pages already listed in *old_vis*.

    Args:
        html:    filename of a saved page source under ./data.
        dict:    running {book title: count} mapping, updated in place.
        old_vis: lines of vis.txt (each name ends with '\\n').
    """
    # Already processed in an earlier run -- nothing to do.
    if html + '\n' in old_vis:
        return
    # Read the whole file at once; the original concatenated readlines()
    # into a string with '+=', which is quadratic and redundant.
    with open('./data/' + html, 'r', encoding='utf-8') as file:
        strr = file.read()
    find(strr, html, dict)
def find(strr, html, dict):
    """Extract 《...》 book titles from page source *strr*, tally them into
    *dict*, and record *html* as visited.

    Args:
        strr: full HTML page source.
        html: record name (question id + '.txt') written to vis.txt.
        dict: running {book title: count} mapping, updated in place.
    """
    books = re.findall('《(.*?)》', strr)
    if books:  # truthiness check instead of comparing against []
        tongji(books, dict)
    # Mark this page as processed even when it contained no titles.
    write_vis(html)
def tongji(books, dict):
    """Count each book title in *books* into *dict*, then rewrite read2.txt
    with the tallies sorted by count, most frequent first.

    Args:
        books: list of raw titles captured between 《 and 》.
        dict:  running {book title: count} mapping, updated in place.
    """
    for book in books:
        # Strip any HTML tags that leaked into the captured title.
        book = re.sub('<.*?>', "", book)
        # dict.get replaces the explicit membership test on dict.keys().
        dict[book] = dict.get(book, 0) + 1
    # Sort by count descending and persist the ranking to read2.txt.
    aps = sorted(dict.items(), key=lambda d: d[1], reverse=True)
    write(aps)
def write(aps):
    """Write the sorted (title, count) pairs *aps* to ./read2.txt, one
    tab-separated pair per line, and report how many titles there are.

    Args:
        aps: list of (title, count) tuples, already sorted.
    """
    print(aps)
    print("当前有" + str(len(aps)) + "本书")
    # 'with' closes the file; the original's file.close() was redundant.
    with open('./read2.txt', 'w', encoding='utf-8') as file:
        # writelines batches the output instead of one write() per book.
        file.writelines(title + "\t" + str(count) + '\n'
                        for title, count in aps)
# 获取data下所有网页源码名字
def all_html():
    """Return the filenames of all saved page sources under ./data."""
    # Return directly; the original bound the result to a local named
    # 'list', shadowing the builtin.
    return os.listdir('./data')
# 获取曾经提取过的数据
def get_old_books():
    """Load previously extracted counts from ./read2.txt.

    Returns:
        dict mapping book title -> int count. Empty when read2.txt does not
        exist yet (first run), instead of crashing as the original did.

    Lines that do not parse as "title<TAB>count" are reported and skipped.
    """
    old_dic_books = {}
    try:
        with open('./read2.txt', 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        return old_dic_books  # nothing extracted yet
    for line in lines:
        # Each well-formed line is "<title>\t<count>\n". The original also
        # had a no-op `book.replace('\n', '')` whose result was discarded.
        try:
            parts = line.rstrip('\n').split('\t')
            old_dic_books[parts[0]] = int(parts[1])
        except Exception as e:
            print(e)  # malformed line: report and skip, as before
    return old_dic_books
def write_vis(html):
    """Append the processed page name *html* to ./vis.txt.

    Calling this with an empty string writes nothing but still creates
    vis.txt if it is missing (append mode creates the file), which is how
    get_old_vis guarantees the file exists before reading it.
    """
    # 'with' closes the file; the explicit close() was redundant, and the
    # old else-branch write('') had no effect beyond opening the file.
    with open('./vis.txt', 'a', encoding='utf-8') as file:
        if html != "":
            file.write(html + '\n')
# 获取曾经提取过的页面名字
def get_old_vis():
    """Return the page names recorded in ./vis.txt, one list entry per line
    (trailing newlines kept, matching how callers compare entries)."""
    write_vis("")  # guarantees vis.txt exists before it is opened for reading
    with open('./vis.txt', 'r', encoding='utf-8') as vis_file:
        return vis_file.readlines()
if __name__ == "__main__":
    old_vis = get_old_vis()    # names of pages already processed
    counts = get_old_books()   # previously extracted book counts
    # Process every saved page source under ./data.
    for html in all_html():
        read(html, counts, old_vis)