The open() function of the webbrowser module launches the default browser and opens the specified URL.
>>> import webbrowser
>>> webbrowser.open('https://www.baidu.com')
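The module also provides an open_new_tab() variant; a minimal sketch (the URL below is just a placeholder):
import webbrowser
webbrowser.open('https://www.python.org/')          # reuse an existing browser window or tab where possible
webbrowser.open_new_tab('https://www.python.org/')  # request a new tab where the browser supports it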
Get an address from the command line or from the clipboard, then open Baidu Maps in the browser with that address loaded automatically.
import sys,webbrowser,pyperclip
# launch the browser and open baidu.com
webbrowser.open('https://www.baidu.com')
if len(sys.argv)>1:
    params=sys.argv[1:] # take the address from the command-line arguments
    address=' '.join(params) # join the argument list into a single string
else:
    address=pyperclip.paste() # otherwise take the address from the clipboard
webbrowser.open(r'http://map.baidu.com/?newmap=1&s=s%26wd%3D'+address+'%26c%3D28&from=alamap&tpl=mapcity')
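If the script is saved as mapIt.py (a hypothetical file name), it can be run as python mapIt.py 北京 天安门 to look up that address, or with no arguments to use whatever address is currently on the clipboard.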
Downloading a file from the web with the requests module
import requests
res=requests.get('https://.../xiaoshuo/44/44376/11931132.html')
print(type(res))
print(res.status_code==requests.codes.ok) # True if the request succeeded
print(len(res.text))
res.encoding='gbk' # requests guesses the encoding automatically; if the text comes out garbled, set it by hand. The page requested above is encoded as gbk.
print(res.text[:100])
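If the encoding guessed from the HTTP headers is wrong, requests can also detect one from the response body; a minimal sketch of that fallback (the URL is a placeholder):
import requests
res = requests.get('https://example.com/', timeout=10)
print(res.encoding)                    # encoding guessed from the HTTP headers
print(res.apparent_encoding)           # encoding detected from the response body
res.encoding = res.apparent_encoding   # apply the detected encoding before reading res.text
print(res.text[:100])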
The raise_for_status() method raises an exception if the download failed and does nothing if it succeeded.
res=requests.get('https://abc/xiaoshuo/44/44376/11931132a.html') # this path does not exist
try:
    res.raise_for_status()
except:
    print('next!!!')
print('without the try block, the exception would stop the script here')
next!!!
without the try block, the exception would stop the script here
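A minimal sketch that catches the specific requests exceptions instead of using a bare except, so unrelated errors are not hidden (the URL is a placeholder):
import requests
try:
    res = requests.get('https://example.com/does-not-exist.html', timeout=10)
    res.raise_for_status()
except requests.exceptions.HTTPError as err:          # 4xx/5xx status codes
    print('HTTP error: %s' % err)
except requests.exceptions.RequestException as err:   # connection errors, timeouts, ...
    print('request failed: %s' % err)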
Downloading a file and saving it to disk
import requests,os
res=requests.get('https://.../xiaoshuo/44/44376/11931132.html') # download the HTML page
res.raise_for_status()
file=open(os.path.join('E:\\','path','web.txt'),'wb') # save it as web.txt
for chunk in res.iter_content(100000): # write 100,000 bytes per chunk
    file.write(chunk)
file.close()
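The same download can also be written with a with-block and stream=True, so the file handle is always closed and the whole body is never held in memory at once; a minimal sketch (URL and file name are placeholders):
import requests
res = requests.get('https://example.com/page.html', stream=True, timeout=10)
res.raise_for_status()
with open('web.txt', 'wb') as file:
    for chunk in res.iter_content(100000):
        file.write(chunk)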
Parsing HTML with the BeautifulSoup module
import requests,bs4,os
# parse a page fetched over the network
url='https://.../xiaoshuo/44/44376/11931132.html'
res=requests.get(url)
res.raise_for_status()
soup=bs4.BeautifulSoup(res.text,"lxml") # specify the parser
print(type(soup))
# parse a local file
file=open(os.path.join('E:\\','path','web.html'))
soup=bs4.BeautifulSoup(file,"lxml")
print(type(soup))
ele=soup.select('.novel h1')
print(len(ele))
title=ele[0].get_text() # get the text inside the element
print(title)
content=soup.select('.novel .yd_text2')[0].getText()
attrs=soup.select('.novel .yd_text2')[0].attrs # get the element's attributes as a dict
print(attrs)
print(content)
parts=soup.select('.novel .pereview a')
print(len(parts))
for i in range(0,len(parts)):
    href=parts[i].get('href') # value of the href attribute
    print(href,end='---')
    target=parts[i].get('target') # value of the target attribute
    print(target,end='---')
    hclass=parts[i].get('class') # value of the class attribute (a list)
    print(hclass)
    print(parts[i].getText()+'---'+href+'---'+target+'---'+str(hclass))
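A small self-contained sketch of the select() patterns used above, run against a made-up HTML snippet so it works without network access:
import bs4
html = '<div class="novel"><h1>Title</h1><a class="next" href="/ch1" target="_blank">Chapter 1</a></div>'
soup = bs4.BeautifulSoup(html, 'lxml')
print(soup.select('.novel h1')[0].get_text())   # class selector plus descendant tag
link = soup.select('.novel a')[0]
print(link.get('href'), link.get('target'))     # individual attribute values
print(link.attrs)                               # all attributes as a dict
print(link.get('class'))                        # class is returned as a list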
Opening every link on a page
import webbrowser,requests,bs4
url='https://.../xiaoshuo/44/44376/'
res=requests.get(url)
res.raise_for_status()
print(res.encoding) # ISO-8859-1 (the page itself is encoded as gbk)
res.encoding='gbk' # switch to the page's real encoding
soup=bs4.BeautifulSoup(res.text,"lxml")
eles=soup.select('.mulu ul li a')
print(soup.original_encoding) # None: the soup was built from an already-decoded str, so nothing was auto-detected
soup.prettify('gbk') # prettify() outputs UTF-8 by default; pass the page's encoding to output gbk instead
rmax=len(eles)
if rmax>10: # the novel has many chapters; open at most 10 as a test
    rmax=10
for i in range(0,rmax):
    eles[i].encode('gbk') # each element can also be encoded individually (redundant here, since the encoding was already set above)
    href=eles[i].get('href')
    path=url+href
    print('open '+eles[i].getText()+' '+href) # without setting the encoding above, this would print mojibake
    webbrowser.open(path) # open the page in the browser
# Notes on handling mojibake
# References:
# 1.requests
# When you make a request, Requests makes educated guesses about the encoding of the response based on the HTTP headers.
# The text encoding guessed by Requests is used when you access r.text. You can find out what encoding Requests is using,
# and change it, using the r.encoding property.
# 2.bs4
# Beautiful Soup uses a sub-library called Unicode, Dammit to detect a document’s encoding and convert it to Unicode.
# The autodetected encoding is available as the .original_encoding attribute of the BeautifulSoup object. Unicode,
# Dammit guesses correctly most of the time, but sometimes it makes mistakes. Sometimes it guesses correctly,
# but only after a byte-by-byte search of the document that takes a very long time. If you happen to know a document’s encoding ahead of time,
# you can avoid mistakes and delays by passing it to the BeautifulSoup constructor as from_encoding.
# Passing from_encoding together with already-decoded Unicode markup raises:
# UserWarning: You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.
# When Beautiful Soup outputs a document it uses UTF-8, whatever the input encoding was; to output in a different encoding, pass it to the prettify() method.
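To make from_encoding take effect, pass the raw bytes (res.content) instead of the already-decoded res.text; a minimal sketch (the URL is a placeholder):
import requests, bs4
res = requests.get('https://example.com/gbk-page.html', timeout=10)
soup = bs4.BeautifulSoup(res.content, 'lxml', from_encoding='gbk')
print(soup.original_encoding)       # reports the encoding that was actually used
print(soup.prettify('gbk')[:100])   # prettify() with an encoding argument returns gbk-encoded bytes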
Downloading a comic and saving it to disk
Layout: comic title (new folder)\chapter title (new folder)\page.jpg
# download a comic
import os,bs4,requests,logging
from urllib.parse import urlparse # URL parsing
from urllib.parse import urljoin # URL joining
logging.basicConfig(level=logging.DEBUG) # logging setup
# helper: download every image on one chapter page
def downloadJpg(view_url,capter):
    logging.debug('downloading page: %s'%view_url)
    res=requests.get(view_url,timeout=request_time_out)
    res.raise_for_status()
    soup=bs4.BeautifulSoup(res.text,'lxml')
    img_eles=soup.select('#main .comic-imgs img')
    for i in range(0,len(img_eles)):
        img_src=img_eles[i].get('data-kksrc') # the real jpg URL needs cleaning up
        src=str(img_src).replace('amp;','')
        logging.debug('image URL: %s'%src)
        img = requests.get(src,timeout=request_time_out) # download the image
        img_file = open(os.path.join(capter, str(i+1) + '.jpg'), 'wb')
        logging.debug('saving: %s' % (os.path.join(capter, str(i+1) + '.jpg')))
        for chunk in img.iter_content(100000):
            img_file.write(chunk)
        img_file.close()
path='http://www.kuaikanmanhua.com/web/topic/1745/'
request_time_out=5.0 # request timeout: 5 seconds
res=requests.get(path,timeout=request_time_out)
res.raise_for_status()
soup=bs4.BeautifulSoup(res.text,'lxml')
capters=soup.select('.article-list .table .tit a')
for i in range(0,len(capters)):
    href=capters[i].get('href')
    title=capters[i].get('title')
    capter=os.path.join('E:\\', 'path', '元尊', title) # create one folder per chapter
    os.makedirs(capter,exist_ok=True)
    logging.debug('created folder: %s'%title)
    downloadJpg(urljoin(path,'../../../%s'%href),capter) # urljoin resolves the relative href; urljoin(path,'../../../%s'%href) amounts to http://www.kuaikanmanhua.com/<href>
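A small self-contained check of what the urljoin call above produces, using a made-up href of the same shape as the chapter links:
from urllib.parse import urljoin
path = 'http://www.kuaikanmanhua.com/web/topic/1745/'
href = 'web/comic/89870/'   # hypothetical example href
print(urljoin(path, '../../../%s' % href))   # -> http://www.kuaikanmanhua.com/web/comic/89870/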