第一版,爬取缩略图
python
import os
import requests
from bs4 import BeautifulSoup # pip install beautifulsoup4
from urllib.parse import urljoin

# Version 1: download every thumbnail shown on the first listing page into
# a "4k" directory that sits next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SITE_ROOT = "https://pic.netbian.com/"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without a timeout

fk = os.path.join(BASE_DIR, '4k')
# exist_ok replaces the racy "check isdir, then mkdir" sequence.
os.makedirs(fk, exist_ok=True)

url = "https://pic.netbian.com/4kmeinv/"
response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
bs = BeautifulSoup(response.text, "html.parser")  # parse the listing page HTML
ul = bs.find(name='ul', attrs={"class": "clearfix"})
# find() returns None when the list is absent; treat that as "nothing to do".
img_list = ul.find_all(name='img') if ul is not None else []
for img in img_list:
    src = img.get("src")
    if not src:
        continue
    # urljoin avoids the "//" that plain concatenation yields for "/path" srcs.
    img_url = urljoin(SITE_ROOT, src)
    file_path = os.path.join(fk, img_url.rsplit('/', 1)[-1])
    try:
        img_response = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
        img_response.raise_for_status()
    except requests.RequestException as exc:
        print(img_url, 'download failed:', exc)
        continue
    # Open the file only after a successful download so that a failed request
    # never leaves an empty file behind.
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
    print(img_url, 'download done .....')
这一版爬取的是缩略图,效果不好,但能通过该页面了解该网站的相关规则。
第二版,爬取大图
python
import os
import requests
from bs4 import BeautifulSoup # pip install beautifulsoup4
from urllib.parse import urljoin

# Version 2: for every link on the first listing page, open the detail page
# and download the image found inside its <a id="img"> element into a "4k"
# directory that sits next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SITE_ROOT = "https://pic.netbian.com/"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without a timeout

fk = os.path.join(BASE_DIR, '4k')
# exist_ok replaces the racy "check isdir, then mkdir" sequence.
os.makedirs(fk, exist_ok=True)

url = "https://pic.netbian.com/4kmeinv/"
response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
bs = BeautifulSoup(response.text, "html.parser")  # parse the listing page HTML
ul = bs.find(name='ul', attrs={"class": "clearfix"})
# find() returns None when the list is absent; treat that as "nothing to do".
a_list = ul.find_all(name='a') if ul is not None else []
for a in a_list:
    href = a.get("href")
    if not href:
        continue
    # urljoin avoids the "//" that plain concatenation yields for "/path" hrefs.
    a_url = urljoin(SITE_ROOT, href)
    try:
        a_response = requests.get(url=a_url, timeout=REQUEST_TIMEOUT)
        a_response.raise_for_status()
        a_bs = BeautifulSoup(a_response.text, 'html.parser')
        # Walk the lookup chain step by step: any link in it may be missing,
        # and one odd detail page must not abort the whole run.
        img_link = a_bs.find(name="a", attrs={"id": "img"})
        img_tag = img_link.find('img') if img_link is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if not src:
            print(a_url, 'no image found, skipped')
            continue
        img_url = urljoin(SITE_ROOT, src)
        img_response = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
        img_response.raise_for_status()
    except requests.RequestException as exc:
        print(a_url, 'download failed:', exc)
        continue
    file_path = os.path.join(fk, img_url.rsplit('/', 1)[-1])
    # Open the file only after a successful download so that a failed request
    # never leaves an empty file behind.
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
    print(img_url, 'download done .....')
第三版,爬取多页
python
import os
import requests
from bs4 import BeautifulSoup # pip install beautifulsoup4
from urllib.parse import urljoin

# Version 3: same as the detail-page scraper, but repeated over the first
# ten listing pages. Images are stored in a "4k" directory next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SITE_ROOT = "https://pic.netbian.com/"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without a timeout

fk = os.path.join(BASE_DIR, '4k')
# exist_ok replaces the racy "check isdir, then mkdir" sequence.
os.makedirs(fk, exist_ok=True)

for index in range(1, 11):  # scrape the first 10 pages
    # The first page has no numeric suffix; later pages are index_<n>.html.
    if index == 1:
        url = "https://pic.netbian.com/4kmeinv/index.html"
    else:
        url = "https://pic.netbian.com/4kmeinv/index_{}.html".format(index)
    try:
        response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable listing page should not stop the remaining pages.
        print(url, 'page request failed:', exc)
        continue
    bs = BeautifulSoup(response.text, "html.parser")  # parse the listing page HTML
    ul = bs.find(name='ul', attrs={"class": "clearfix"})
    # find() returns None when the list is absent; treat that as an empty page.
    a_list = ul.find_all(name='a') if ul is not None else []
    for a in a_list:
        href = a.get("href")
        if not href:
            continue
        # urljoin avoids the "//" that plain concatenation yields for "/path" hrefs.
        a_url = urljoin(SITE_ROOT, href)
        try:
            a_response = requests.get(url=a_url, timeout=REQUEST_TIMEOUT)
            a_response.raise_for_status()
            a_bs = BeautifulSoup(a_response.text, 'html.parser')
            # Walk the lookup chain step by step: any link in it may be
            # missing, and one odd detail page must not abort the whole run.
            img_link = a_bs.find(name="a", attrs={"id": "img"})
            img_tag = img_link.find('img') if img_link is not None else None
            src = img_tag.get('src') if img_tag is not None else None
            if not src:
                print(a_url, 'no image found, skipped')
                continue
            img_url = urljoin(SITE_ROOT, src)
            img_response = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            print(a_url, 'download failed:', exc)
            continue
        file_path = os.path.join(fk, img_url.rsplit('/', 1)[-1])
        # Open the file only after a successful download so that a failed
        # request never leaves an empty file behind.
        with open(file_path, 'wb') as f:
            f.write(img_response.content)
        print(img_url, 'download done .....')
    print('第{}页爬取完毕...'.format(index))
以上代码截至2021年6月22日运行无误。