Python 队列的使用

讨论 Rambo_gor
Lv5 宗师级炼丹师
发布在 Python编程   1348   0
讨论 Rambo_gor   1348   0

    Python 队列的使用

    下载网站小说并保存

    #coding:utf-8
    """
    Author:fanbinglin
    Date:  2020/01/28 21点22分
    功能:
    下载网站小说并保存
    http://www.xbiquge.la/5/5443/
    
    """
    import requests
    from lxml import etree
    import re
    from bs4 import BeautifulSoup as bf
    import os
    import time
    from queue import Queue
    import threading
    # Create the module-level condition variable objects.
    condition_html = threading.Condition()
    condition_save = threading.Condition()
    # os.system("del *.txt")
    class downLoad:
        """Multi-threaded downloader that saves each chapter page to a text file.

        Work flows through two queues:

        * ``url_queue``  -- chapter URLs that still have to be fetched.
        * ``html_queue`` -- ``(url, parsed_html)`` pairs waiting to be saved.

        Every chapter that is saved successfully gets its URL appended to
        ``finishUrlList.txt``; URLs listed there are skipped on the next run.
        """

        # Seconds to wait on one HTTP request before giving up, so that a
        # stalled connection cannot block a worker (and the final join) forever.
        REQUEST_TIMEOUT = 30

        def __init__(self):
            """Build the list of chapter URLs that still need downloading.

            The full chapter list is read from ``totle.txt`` when that file has
            content; otherwise it is scraped from the index page and written to
            ``totle.txt``. URLs found in ``finishUrlList.txt`` are excluded.
            """
            self.header = {
            }
            self.url_queue = Queue()
            self.html_queue = Queue()
            self.save_queue = Queue()
            urlIndex = 'http://www.xbiquge.la/5/5443/'

            # readFile() returns False when the file cannot be read.
            finishFlag = readFile('finishUrlList.txt')
            finishUrlList = []
            if finishFlag:
                finishUrlList = [x for x in finishFlag.split('\n') if x != ""]
            self.finishUrlList = finishUrlList

            totleFlag = readFile('totle.txt')
            if totleFlag:
                totleList = [x for x in totleFlag.split('\n') if x != ""]
            else:
                response = requests.get(urlIndex, timeout=self.REQUEST_TIMEOUT)
                response.encoding = response.apparent_encoding
                html = bf(response.text,'html.parser')
                aTagList = html.select('dd a')
                # De-duplicate the numeric chapter ids taken from each link's
                # file name, then rebuild the URLs in ascending id order.
                chapterIds = {
                    int(a['href'].split('/')[-1].split('.')[0]) for a in aTagList
                }
                totleList = [urlIndex + str(y) + '.html' for y in sorted(chapterIds)]
                saveFile('totle.txt',"\n".join(totleList),'w+')

            # A set makes the "already downloaded?" check O(1) per URL.
            finished = set(finishUrlList)
            self.urlList = [x for x in totleList if x not in finished]
            # Number of chapters that are skipped because they are already done.
            print(len(totleList)-len(self.urlList))
            time.sleep(1)

        def get_totle_url(self):
            """Put every pending chapter URL on ``url_queue``."""
            for url in self.urlList:
                self.url_queue.put(url)

        def parse_html(self):
            """Worker loop: fetch URLs from ``url_queue`` and queue the parsed pages.

            Loops forever; run() starts it in a daemon thread, so it ends when
            the main thread exits. Queue is thread-safe, so no extra lock is
            held here and the download workers can fetch in parallel.
            """
            while True:
                url = self.url_queue.get()
                try:
                    response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                    response.encoding = response.apparent_encoding
                    html = bf(response.text,'html.parser')
                    self.html_queue.put((url, html))
                except requests.RequestException as exc:
                    # The URL is not recorded as finished, so a later run
                    # will try it again.
                    print("ERROR", url, exc)
                finally:
                    # Always acknowledge the item, otherwise url_queue.join()
                    # in run() would never return after a failed request.
                    self.url_queue.task_done()

        def save_content(self):
            """Worker loop: write queued chapter pages to ``<page name>.txt``.

            Loops forever; run() starts it in a daemon thread, so it ends when
            the main thread exits.
            """
            while True:
                url, html = self.html_queue.get()
                try:
                    content = html.select_one('#content').text.replace(' ','') + '\n'
                    print(url)
                    indexId = url.split('/')[-1]
                    with open(indexId + '.txt','w+',encoding = 'utf-8') as f:
                        f.write(content)
                    # All save workers append to the same progress file, so
                    # serialise those appends.
                    with condition_save:
                        saveFile('finishUrlList.txt',url + '\n','a+')
                except Exception as exc:
                    # Thread boundary: one bad page (e.g. no '#content' element)
                    # must not kill the worker.
                    print("ERROR", url, exc)
                finally:
                    self.html_queue.task_done()

        def run(self):
            """Start the worker threads and block until both queues are drained."""
            thread_list = []
            thread_url = threading.Thread(target=self.get_totle_url, daemon=True)
            thread_list.append(thread_url)
            for i in range(4):
                thread_parse = threading.Thread(target=self.parse_html, daemon=True)
                thread_list.append(thread_parse)
            for k in range(15):
                thread_save = threading.Thread(target=self.save_content, daemon=True)
                thread_list.append(thread_save)

            # Queue every URL before the workers start; joining the producer
            # guarantees url_queue.join() below cannot return early.
            thread_url.start()
            thread_url.join()

            for t in thread_list[1:]:
                t.start()
                print(t.name)

            # Each html_queue item is put before its url_queue item is marked
            # done, so waiting in this order covers all of the work.
            self.url_queue.join()
            self.html_queue.join()
            self.save_queue.join()
    
    def saveFile(filename,content,mode):
        """Write ``content`` to ``filename`` as UTF-8 text and report the outcome.

        Args:
            filename: Path of the file to write.
            content: Text to write.
            mode: Text mode passed to ``open()`` (callers use 'w', 'w+', 'a+').

        Failures are reported with a printed message instead of being raised.
        """
        try:
            with open(filename,mode,encoding='utf-8') as f:
                f.write(content)
        except (OSError, ValueError, TypeError):
            # No f.close() here: the with-statement already closed the file,
            # and when open() itself failed there is no file object at all.
            print('\t'+filename + "保存失败!")
        else:
            print('\t'+filename + "保存成功!")
            
    def readFile(filename):
        """Return the UTF-8 text content of ``filename``, or False on failure.

        False is returned when the file cannot be opened or read (OSError) or
        when its bytes are not valid UTF-8 (UnicodeDecodeError, a ValueError).
        """
        try:
            with open(filename,"r",encoding='utf-8') as f:
                return f.read()
        except (OSError, ValueError):
            return False
    if __name__ == "__main__":
        # mode 1: merge the downloaded chapter files into 'panlong.txt'.
        # any other value: download the chapters and print the elapsed seconds.
        mode = 0
        if mode == 1:
            # os.listdir() returns names in arbitrary order, so sort them before
            # concatenating. Ordering by (length of the part before the first
            # '.', name) puts purely numeric prefixes in ascending numeric
            # order, matching the ascending-id order used for the URL list.
            txtList = sorted(
                (x for x in os.listdir() if x[0:6].isdigit()),
                key=lambda name: (len(name.split('.')[0]), name),
            )
            saveFile('panlong.txt',''.join([readFile(x) for x in txtList]),'w')
        else:
            startTime = time.time()
            story = downLoad()
            story.run()
            endTime = time.time()
            print(endTime-startTime)
    版权声明:作者保留权利,不代表本站立场。如需转载请联系本站以及作者。

    参与讨论

    回复《 Python 队列的使用

    EditorJs 编辑器

    沙发,很寂寞~
    反馈
    to-top--btn