惯性聚合 高效追踪和阅读你感兴趣的博客、新闻、科技资讯
阅读原文 在惯性聚合中打开

推荐订阅源

宝玉的分享
宝玉的分享
NISL@THU
NISL@THU
E
Exploit-DB.com RSS Feed
L
LINUX DO - 热门话题
L
Lohrmann on Cybersecurity
K
Kaspersky official blog
Project Zero
Project Zero
Cisco Talos Blog
Cisco Talos Blog
T
The Exploit Database - CXSecurity.com
P
Palo Alto Networks Blog
C
CXSECURITY Database RSS Feed - CXSecurity.com
T
Threatpost
S
Schneier on Security
G
GRAHAM CLULEY
The Hacker News
The Hacker News
T
Threat Research - Cisco Blogs
Scott Helme
Scott Helme
Threat Intelligence Blog | Flashpoint
Threat Intelligence Blog | Flashpoint
P
Privacy & Cybersecurity Law Blog
C
Cyber Attacks, Cyber Crime and Cyber Security
Cyberwarzone
Cyberwarzone
C
CERT Recently Published Vulnerability Notes
T
Tor Project blog
AWS News Blog
AWS News Blog
Simon Willison's Weblog
Simon Willison's Weblog
cs.CL updates on arXiv.org
cs.CL updates on arXiv.org
爱范儿
爱范儿
P
Privacy International News Feed
云风的 BLOG
云风的 BLOG
P
Proofpoint News Feed
S
Securelist
G
Google Developers Blog
The Last Watchdog
The Last Watchdog
Google Online Security Blog
Google Online Security Blog
美团技术团队
F
Fortinet All Blogs
小众软件
小众软件
Recorded Future
Recorded Future
V
Visual Studio Blog
B
Blog RSS Feed
H
Help Net Security
CTFtime.org: upcoming CTF events
CTFtime.org: upcoming CTF events
Google DeepMind News
Google DeepMind News
Blog — PlanetScale
Blog — PlanetScale
博客园 - 聂微东
Stack Overflow Blog
Stack Overflow Blog
Martin Fowler
Martin Fowler
Latest news
Latest news
Spread Privacy
Spread Privacy
H
Heimdal Security Blog

博客园 - 昕

Egret飞行模拟-开发记录03-LoadingUI界面 Egret飞行模拟-开发记录02 Egret飞行模拟-开发记录01 python学习笔记之五 中岛美雪——疑似穿越人物 tensorflow学习001——MNIST 机器学习笔记之三-yolov3+win7+vs2017+gpu+opencv编译 机器学习笔记之二-win10+cuda9.1+CUDNN7+Anaconda3+VS2017+tensorflow1.5+opencv3.4 机器学习笔记之一 python学习笔记之四-多进程&多线程&异步非阻塞 python网页爬虫开发之六-Selenium使用 python网页爬虫开发之五-反爬 python网页爬虫开发之四-串行爬虫代码示例 python学习笔记之三-计算运行时间 python网页爬虫开发之三 python网页爬虫开发之二 U3D学习14-一阶段学习总结 U3D学习13-数据存储 U3D学习12-黑暗之光实例
python网页爬虫开发之七-多线程爬虫示例01
· 2018-10-25 · via 博客园 - 昕

from urllib.request import quote

import urllib.request

from bs4 import BeautifulSoup

import re

import multiprocessing

import os

import time

def start():
    """Prompt on stdin for the first chapter to download.

    Keeps asking until the user types a 1-based chapter number that falls
    inside the module-level ``chapter_link`` list.

    Returns:
        The zero-based index of the chosen first chapter.

    Raises:
        ValueError: If ``chapter_link`` is empty, because no answer could
            ever be valid in that case.
    """
    last_index = len(chapter_link) - 1
    if last_index < 0:
        raise ValueError("chapter_link is empty; there is no chapter to select")

    while True:
        try:
            # The user counts chapters from 1; list indexes count from 0.
            index = int(input("请输入开始章节(从1开始):")) - 1
        except ValueError:
            # Non-numeric input is treated like any other invalid answer
            # instead of aborting the script with a traceback.
            print("开始章节错误,请重新输入")
            continue
        if 0 <= index <= last_index:
            return index
        print("开始章节错误,请重新输入")

def end():
    """Prompt on stdin for the last chapter to download.

    Keeps asking until the user types a 1-based chapter number that falls
    inside the module-level ``chapter_link`` list. The answer is not compared
    against the first chapter chosen earlier.

    Returns:
        The zero-based index of the chosen last chapter.

    Raises:
        ValueError: If ``chapter_link`` is empty, because no answer could
            ever be valid in that case.
    """
    last_index = len(chapter_link) - 1
    if last_index < 0:
        raise ValueError("chapter_link is empty; there is no chapter to select")

    while True:
        try:
            # The user counts chapters from 1; list indexes count from 0.
            index = int(input("请输入最后章节(最大为总章节数):")) - 1
        except ValueError:
            # Non-numeric input is treated like any other invalid answer
            # instead of aborting the script with a traceback.
            print("结束章节错误,请重新输入")
            continue
        if 0 <= index <= last_index:
            return index
        print("结束章节错误,请重新输入")

def all():
    """Collect the chapter URLs and the book title from the parsed book page.

    Reads the module-level ``soup`` (a parsed BeautifulSoup document).

    NOTE: this function shadows the builtin ``all`` for the whole module; the
    name is kept because callers in this file use it.

    Returns:
        A ``(chapter_link, name)`` tuple. ``chapter_link`` is the list of
        absolute chapter URLs in the order ``find_all`` yields the anchors
        (empty when the page has no matching anchors); ``name`` is the string
        of the page's first ``<h1>``.
    """
    site = "http://www.x23us.us"
    # NOTE: the '.' before "html" is unescaped, so it matches any character.
    anchors = soup.find_all(name="a", attrs={"href": re.compile(r"/\w+/\w+.html")})
    # Read the href attribute directly rather than re-parsing the stringified
    # tags; an empty result stays an empty list instead of becoming a single
    # bogus link to the site root.
    chapter_link = [site + anchor["href"] for anchor in anchors]
    name = soup.h1.string
    return chapter_link, name

def chapter():
    """Return the URLs of the chapters selected for download.

    Reads the module-level ``chapter_link`` list together with the
    module-level ``start`` and ``end`` indexes (both inclusive).

    Returns:
        A new list holding ``chapter_link[start]`` through
        ``chapter_link[end]``; empty when ``end`` is smaller than ``start``.
    """
    return [chapter_link[position] for position in range(start, end + 1)]

def mkdir(path):
    """Create the directory ``path`` (with any missing parents) if needed.

    Prints a short status message saying whether the directory was created
    or something already existed at that path.

    Args:
        path: Directory path to create.
    """
    if os.path.exists(path):
        print("文件已存在")
        return

    os.makedirs(path)
    print("创建成功")

def remadir():
    """Rename the temporary download directory after the book title.

    Reads the module-level ``img_path`` (the directory to rename) and
    ``name`` (the book title). The target is ``E:/txt/<name>``; when that
    path already exists, a timestamp is appended to the target so the
    existing directory is left untouched.

    NOTE(review): ``change()`` always reads ``E:/txt/<name>/``, so after the
    timestamped rename it will not look at the directory produced here.
    """
    img_path2 = "E:/txt/" + name

    if not os.path.exists(img_path2):
        os.rename(img_path, img_path2)
        print("已全部下载完成!")
        return

    # %H is the 24-hour clock. The previous %I (12-hour clock) produced the
    # same suffix for e.g. 03:15 and 15:15, so the "unique" name could collide.
    downloadtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
    os.rename(img_path, img_path2 + downloadtime)
    print("已全部下载完成!\n"+"文件名:" +name + " 已存在,重命名为:" + name+ downloadtime + "\n" + "请勿重复操作")

def download(url):
    """Download one chapter page and append its text to a per-chapter file.

    The page is fetched, decoded as GBK and parsed; the first ``<h1>`` is used
    as the chapter heading and the first ``<div id="content">`` as the body.
    The result is appended to ``E:/txt/txt/<chapter>.txt``, where
    ``<chapter>`` is the last path segment of ``url`` before ``.html``.

    Args:
        url: Absolute chapter URL of the form
            ``http://www.x23us.us/<book>/<chapter>.html``.

    Raises:
        IndexError: If ``url`` does not have the expected form, or the page
            has no ``<h1>`` or no ``<div id="content">``.
    """
    # Pass the header through the constructor. Assigning a dict to
    # ``req.add_header`` only replaced the method on the instance, so the
    # User-Agent was never actually sent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'}
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        page = response.read()
    html = page.decode('gbk')
    soup = BeautifulSoup(html, "html.parser")

    book_txt = soup.find_all(name="div", attrs={"id": "content"})
    txt = soup.find_all(name="h1")
    # Drops the <h1> tags, the characters < > : ? * " | and the sequence "|/".
    name = re.sub(r'<h1>|</h1>|\|/|<|>|:|\?|\*|"|\|', '', str(txt[0]))

    filter_order = r'http://www.x23us.us/.+?/(.+?).html'
    order = re.findall(filter_order, url)[0]

    # Strip the wrapping markup, line breaks and spacing from the body text.
    body = re.sub(r'<div id="content" name="content">|</div>|<br/>\n<br/>| |\n', '', str(book_txt[0]))
    book = name + "\n" + body.strip() + "\n\n"

    # "a" appends rather than overwrites; the encode/decode round trip drops
    # any character that GBK cannot represent.
    with open("E:/txt/txt/" + order+".txt", "a") as f:
        f.write(book.encode('gbk', 'ignore').decode('gbk'))

    print(name+"下载完成")

def _chapter_sort_key(filename):
    """Order file names numerically by stem, with non-numeric stems last."""
    stem = os.path.splitext(filename)[0]
    if stem.isdecimal():
        return (0, int(stem), "")
    return (1, 0, stem)


def change():
    """Merge every file in the book directory into a single text file.

    Reads the module-level ``name``. Each file in ``E:/txt/<name>/`` is read,
    prefixed with a newline, and deleted; the concatenated text is then
    appended to ``E:/txt/<name>/<name>.txt``.
    """
    folder = "E:/txt/"+name+"/"

    # os.listdir returns entries in arbitrary order, so sort explicitly.
    # Presumably the per-chapter files are named after ascending numeric
    # chapter ids (see the caller that writes them) - TODO confirm.
    filenames = sorted(os.listdir(folder), key=_chapter_sort_key)

    parts = []
    for filename in filenames:
        path = folder + filename
        with open(path, "r") as chapter_file:
            parts.append("\n" + chapter_file.read())
        os.remove(path)

    with open(folder + name + ".txt", "a") as merged:
        merged.write("".join(parts))

if __name__ == '__main__':
    # Interactive entry point: search for a book, ask which chapters to fetch,
    # download them in parallel, then merge them into one text file.

    # Keep asking for a title until a search is accepted.
    while True:
        keyword = input("请输入书名(精确):")
        # Validate the raw input. The URL-quoted form is several characters
        # long even for a single CJK character, so checking its length let
        # one-character titles through.
        if len(keyword) < 2:
            print("请输入至少2个字符长度!")
            continue

        # The site uses GBK, and the URL-encoding of GBK bytes differs from
        # that of UTF-8 bytes, so encode to GBK before quoting.
        a = quote(keyword.encode('GBK'))
        url = "http://www.x23us.us/modules/article/search.php?searchkey=" + a
        # Pass the header through the constructor. Assigning a dict to
        # ``req.add_header`` never sent the User-Agent.
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'})
        with urllib.request.urlopen(req) as response:
            page = response.read()
        html = page.decode('gbk')
        soup = BeautifulSoup(html, "html.parser")

        # A page without any <div class="layout"> is treated as a hit.
        # NOTE(review): presumably an exact match lands directly on the book
        # page, which lacks that element - confirm against the live site.
        b = len(soup.find_all(name="div", attrs={"class": "layout"}))
        if b == 0:
            print("搜索成功")
            break
        print("无结果,请重新输入!")

    # Parse the book page once and unpack both results.
    chapter_link, name = all()
    if not chapter_link:
        raise SystemExit("未找到任何章节")
    print("一共有" + str(len(chapter_link)) + "章")

    # chapter() reads the module-level names ``start`` and ``end`` as
    # integers, so the prompt functions are deliberately rebound here.
    start = start()
    end = end()

    img_path = "E:/txt/txt/"
    mkdir(img_path)

    links = chapter()
    pool = multiprocessing.Pool(processes=10)
    pool.map(download, links)
    pool.close()
    pool.join()

    remadir()
    change()
    # Leave the final messages on screen briefly before the window closes.
    time.sleep(5)