import urllib.request,re
# Matches one chapter entry on the index page: group 1 is the chapter link,
# group 2 is the chapter title. ".*?" is a non-greedy wildcard; only the
# parenthesised groups are returned by findall().
_CHAPTER_LINK_RE = re.compile(r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>')

# Captures everything between the first space and the first inline
# <script> tag of a chapter page; re.S lets "." span newlines.
# NOTE(review): the leading-space anchor (and the space stripping in
# _clean_chapter_text) may originally have targeted "&nbsp;" entities --
# confirm against the live page markup.
_CHAPTER_BODY_RE = re.compile(r' (.*?)<script type="text/javascript">', re.S)

# Characters that are not allowed in file names on common file systems.
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')

_PAGE_ENCODING = 'gbk'
_REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging forever


def _fetch_html(url):
    """Download *url* and return its body decoded with the site's GBK encoding."""
    with urllib.request.urlopen(url, timeout=_REQUEST_TIMEOUT) as response:
        return response.read().decode(_PAGE_ENCODING)


def _clean_chapter_text(raw):
    """Strip ``<br />`` tags and spaces from the captured chapter markup."""
    # The tag must be removed BEFORE the spaces: stripping spaces first turns
    # "<br />" into "<br/>", which the tag replacement would then never match.
    return raw.replace("<br />", "").replace(" ", "")


def getNovelContent(index_url='http://www.quanshuwang.com/book/0/742',
                    output_dir='.', fetch=_fetch_html):
    """Download every chapter linked from a novel index page into text files.

    The index page is scanned for ``<li><a href=... title=...>`` entries; each
    linked chapter page is fetched, its text is extracted and cleaned, and the
    result is written to ``<output_dir>/<chapter title>.txt`` as UTF-8.

    Args:
        index_url: URL of the novel's chapter index page.
        output_dir: Directory that receives the ``.txt`` files (created if
            missing). Defaults to the current working directory.
        fetch: Callable taking a URL and returning the page as ``str``.
            Defaults to an HTTP download decoded as GBK.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If a page cannot be downloaded (default fetcher).
        UnicodeDecodeError: If a page is not valid GBK (default fetcher).
        OSError: If a chapter file cannot be written.
    """
    import os
    from urllib.parse import urljoin

    index_html = fetch(index_url)
    os.makedirs(output_dir, exist_ok=True)

    for href, novel_title in _CHAPTER_LINK_RE.findall(index_html):
        # urljoin leaves absolute links untouched and resolves relative ones.
        chapter_html = fetch(urljoin(index_url, href))

        match = _CHAPTER_BODY_RE.search(chapter_html)
        if match is None:
            # Message reads "skipping <title>: chapter body not found".
            # Skip this chapter instead of aborting the run with IndexError.
            print('跳过%s:未找到正文内容' % novel_title)
            continue

        chapter_text = _clean_chapter_text(match.group(1))

        print('正在保存%s' % novel_title)
        filename = _UNSAFE_FILENAME_CHARS_RE.sub('_', novel_title) + '.txt'
        # Explicit UTF-8 so Chinese text is written identically on every
        # platform; "with" guarantees the handle is closed even on error.
        with open(os.path.join(output_dir, filename), 'w',
                  encoding='utf-8') as out_file:
            out_file.write(chapter_text)
# Script entry point: start the download with the function's default arguments.
# NOTE(review): no `if __name__ == "__main__":` guard is visible here, so this
# call presumably also runs whenever the file is imported -- confirm intent.
getNovelContent()