Using a Python Queue
Download a novel from a website and save it
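The script below is built on the standard producer/consumer pattern from the queue module: one thread puts work items into a Queue, daemon worker threads get items, process them, and call task_done(), and the main thread waits on join(). As a point of reference, here is a minimal sketch of that pattern; the names (task_queue, producer, consumer) are illustrative and do not appear in the script itself. The full script follows.

import threading
from queue import Queue

task_queue = Queue()

def producer():
    for n in range(10):
        task_queue.put(n)            # hand work to the consumers

def consumer():
    while True:                      # daemon thread: dies with the main thread
        item = task_queue.get()
        print("processing", item)    # the real work would go here
        task_queue.task_done()       # mark this item as finished

threading.Thread(target=producer).start()
for _ in range(3):
    threading.Thread(target=consumer, daemon=True).start()
task_queue.join()                    # returns once every item is processed
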
#coding:utf-8
"""
Author:fanbinglin
Date: 2020/01/28 21:22
Purpose:
    Download a novel from the website and save it.
    http://www.xbiquge.la/5/5443/
"""
import requests
from lxml import etree
import re
from bs4 import BeautifulSoup as bf
import os
import time
from queue import Queue
import threading
# Create condition-variable objects (used to serialize the workers below)
condition_html = threading.Condition()
condition_save = threading.Condition()
# os.system("del *.txt")
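# Pipeline overview: get_totle_url feeds chapter URLs into url_queue;
# parse_html workers fetch each page and push the parsed BeautifulSoup object
# into html_queue; save_content workers extract the chapter text and write
# one <chapter id>.txt file per chapter.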
class downLoad:
    def __init__(self):
        self.header = {}
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.save_queue = Queue()
        urlIndex = 'http://www.xbiquge.la/5/5443/'
        dns = 'http://www.xbiquge.la/'  # base URL of the site (not used below)
        # finishUrlList.txt records chapter URLs that were already saved, so an
        # interrupted run can be resumed without re-downloading those chapters.
        finishFlag = readFile('finishUrlList.txt')
        # print(finishFlag)
        finishUrlList = []
        if finishFlag:
            finishUrlList = [x for x in finishFlag.split('\n') if x != ""]
        self.finishUrlList = finishUrlList
        # totle.txt caches the full chapter list scraped from the index page.
        totleFlag = readFile('totle.txt')
        if totleFlag:
            totleList = [x for x in totleFlag.split('\n') if x != ""]
        else:
            response = requests.get(urlIndex)
            response.encoding = response.apparent_encoding
            html = bf(response.text, 'html.parser')
            aTagList = html.select('dd a')
            # Collect the numeric chapter ids from the hrefs, de-duplicate,
            # sort, and rebuild the absolute chapter URLs.
            totleList = [urlIndex + str(y) + '.html'
                         for y in sorted(set(int(x['href'].split('/')[-1].split('.')[0])
                                             for x in aTagList))]
            saveFile('totle.txt', "\n".join(totleList), 'w+')
        self.urlList = [x for x in totleList if x not in finishUrlList]
        print(len(totleList) - len(self.urlList))  # chapters already downloaded
        time.sleep(1)

    def get_totle_url(self):
        for x in self.urlList:
            self.url_queue.put(x)

    def parse_html(self):
        global condition_html
        # Daemon worker: loops forever and dies with the main thread once the
        # queue join() calls in run() have returned.
        while True:
            condition_html.acquire()  # used as a lock: one fetch at a time
            url = self.url_queue.get()
            response = requests.get(url)
            response.encoding = response.apparent_encoding
            html = bf(response.text, 'html.parser')
            html.url = url  # remember which URL this page came from
            self.html_queue.put(html)
            self.url_queue.task_done()
            condition_html.release()

    def save_content(self):
        global condition_save
        while True:  # daemon worker, same pattern as parse_html
            condition_save.acquire()
            html = self.html_queue.get()
            try:
                content = html.select_one('#content').text.replace(' ', '') + '\n'
                url = html.url
                print(url)
                indexId = url.split('/')[-1]  # e.g. "<chapter id>.html"
                with open(indexId + '.txt', 'w+', encoding='utf-8') as f:
                    f.write(content)
                # Record the finished URL so this chapter is skipped next run.
                saveFile('finishUrlList.txt', url + '\n', 'a+')
            except Exception:
                print("ERROR")
            self.html_queue.task_done()
            condition_save.release()

    def run(self):
        thread_list = []
        thread_url = threading.Thread(target=self.get_totle_url)
        thread_list.append(thread_url)
        for i in range(4):
            thread_parse = threading.Thread(target=self.parse_html)
            thread_list.append(thread_parse)
        for k in range(15):
            thread_save = threading.Thread(target=self.save_content)
            thread_list.append(thread_save)
        # Start the URL producer first and give it a head start to fill url_queue.
        thread_list[0].daemon = True
        thread_list[0].start()
        time.sleep(2)
        for t in thread_list[1:]:
            t.daemon = True  # daemon workers die with the main thread
            t.start()
            print(t.name)
        # Block until every queued item has been fetched and saved.
        self.url_queue.join()
        self.html_queue.join()
        self.save_queue.join()

def saveFile(filename, content, mode):
    try:
        # Text files only: binary files cannot be written with utf-8 encoding.
        with open(filename, mode, encoding='utf-8') as f:
            f.write(content)
        print('\t' + filename + " saved successfully!")
    except Exception:
        print('\t' + filename + " failed to save!")


def readFile(filename):
    try:
        with open(filename, "r", encoding='utf-8') as f:
            return f.read()
    except Exception:
        return False

if __name__ == "__main__":
    mode = 0
    if mode == 1:
        # Merge the per-chapter files (names starting with the numeric id)
        # into a single book file.
        txtList = [x for x in os.listdir() if x[0:6].isdigit()]
        saveFile('panlong.txt', ''.join([readFile(x) for x in txtList]), 'w')
    else:
        startTime = time.time()
        story = downLoad()
        story.run()
        endTime = time.time()
        print(endTime - startTime)  # total run time in seconds
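To run it, leave mode = 0 to download the chapters; the script resumes from finishUrlList.txt and reuses the cached chapter list in totle.txt if they exist. After the download finishes, change mode to 1 and run it again to merge the numbered chapter files into panlong.txt. The thread counts (4 page fetchers, 15 savers) and the 2-second head start for the URL producer are just the tuning used here and can be adjusted.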