校花网
校花网官网
pip install requests
pip install beautifulsoup4
爬取单页
python
import os
import requests
from bs4 import BeautifulSoup
# Absolute directory containing this script; downloads are stored beneath it.
base_dir = os.path.split(os.path.abspath(__file__))[0]
def spider():
    """Download every gallery linked from the nice.ruyile.com listing page.

    For each ``div.tp_list`` entry on the listing page, the gallery's detail
    page is fetched and every image found in its second ``div.m6_js`` block
    is saved to ``<base_dir>/img_list/<gallery title>/<image file name>``.
    """
    response = requests.get(url='https://nice.ruyile.com/?f=2')
    soup = BeautifulSoup(response.text, 'html.parser')
    # Outer div that wraps all of the gallery thumbnails.
    all_content = soup.find(name='div', attrs={'class': 'm3_xhtp'})
    tag_list = all_content.find_all(name='div', attrs={'class': 'tp_list'})
    # Scheme + host of the listing page; detail links are appended to it.
    site_root = response.url.split('/?')[0]
    for item in tag_list:
        # The second anchor of an entry supplies the title text and the
        # link to the gallery's detail page.
        res = item.find_all(name='a')[1]
        a_content_file_path = res.text
        a_url = site_root + res.get('href')
        # e.g. https://nice.ruyile.com/r16604/ 清纯大眼睛MM

        # Open the gallery's detail page.
        girl_details = requests.get(url=a_url)
        girl_soup = BeautifulSoup(girl_details.text, 'html.parser')
        img_all_div = girl_soup.find_all(name='div', attrs={'class': 'm6_js'})[1]
        img_list = img_all_div.find_all(name='p')

        file_path = os.path.join(base_dir, 'img_list', a_content_file_path)
        # exist_ok keeps a second run (or a repeated title) from crashing
        # with FileExistsError.
        os.makedirs(file_path, exist_ok=True)
        for i in img_list:
            img_tag = i.find(name='img')
            if img_tag is None:
                # Text-only paragraph: nothing to download.
                continue
            img_src = img_tag.get('src')
            if not img_src:
                continue
            img_content = requests.get(url=img_src)
            file_name = img_src.rsplit('/', 1)[-1]
            with open(os.path.join(file_path, file_name), 'wb') as f:
                f.write(img_content.content)
# Script entry point: crawl only when executed directly, not when imported.
if __name__ == "__main__":
    spider()
7160图片大全
python
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
# Absolute directory containing this script; images are saved beneath it.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
def spider(line):
    """Download every list-item image from one www.7160.com listing page.

    Args:
        line: Page number substituted into the ``list_6_{}.html`` URL.

    Images are written to ``<BASE_PATH>/a/<image file name>``.
    """
    response = requests.get(url='http://www.7160.com/xiaohua/list_6_{}.html'.format(line))
    # Decode as GBK rather than the encoding requests guessed
    # (presumably the site's real charset -- confirm if titles look garbled).
    response.encoding = 'GBK'
    soup = BeautifulSoup(response.text, 'html.parser')
    div = soup.find(name='div', attrs={'class': "news_bom-left"})
    if div is None:
        print(response.url, 'no "news_bom-left" block found, skipped')
        return

    # open() does not create missing parent directories, so make sure the
    # output directory exists; exist_ok tolerates other pool threads
    # creating it at the same time.
    target_dir = os.path.join(BASE_PATH, 'a')
    os.makedirs(target_dir, exist_ok=True)

    for li in div.find_all(name='li'):
        img = li.find('img')
        if img is None:
            continue
        a_url = img.get('src')
        if not a_url:
            continue
        print(response.url, a_url)
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        res = requests.get(a_url)
        path = os.path.join(target_dir, a_url.rsplit('/', 1)[-1])
        with open(path, mode='wb') as f:
            f.write(res.content)
def run():
    """Crawl listing pages 1-10 concurrently on a pool of 10 threads.

    Blocks until every page has been processed. A failure in one page is
    reported on stdout and does not stop the remaining pages.
    """
    # The context manager shuts the pool down once all tasks have finished.
    with ThreadPoolExecutor(10) as pool:
        futures = {pool.submit(spider, page): page for page in range(1, 11)}
        for future, page in futures.items():
            try:
                # result() re-raises whatever the task raised; without this
                # call worker exceptions would be dropped silently.
                future.result()
            except Exception as exc:
                print('page {} failed: {!r}'.format(page, exc))
# Script entry point: start the threaded crawl only when run directly.
if __name__ == "__main__":
    run()
天极网
python
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
# Absolute directory containing this script; galleries are saved beneath it.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
def worker(a_url, title):
    """Download the images referenced by one gallery page.

    Args:
        a_url: URL of the gallery page to fetch.
        title: Gallery title; used as the sub-directory name under
            ``<BASE_PATH>/b``.

    Every ``<img>`` inside the page's ``div.overview`` is saved to
    ``<BASE_PATH>/b/<title>/<file name taken from the img src>``.
    """
    response = requests.get(a_url)
    # Decode as GBK rather than the encoding requests guessed
    # (presumably the site's real charset -- confirm).
    response.encoding = 'GBK'
    soup = BeautifulSoup(response.text, 'html.parser')
    div = soup.find(name='div', attrs={'class': "overview"})
    if div is None:
        print(a_url, 'no "overview" block found, skipped')
        return

    # Do not rely on the caller having created the directory; exist_ok also
    # tolerates another thread creating it concurrently.
    target_dir = os.path.join(BASE_PATH, 'b', title)
    os.makedirs(target_dir, exist_ok=True)

    for item in div.find_all(name='img'):
        src = item.get('src')
        if not src:
            continue
        print(a_url, src)
        # The '113x113' token in the src is swapped for '740x-' before
        # downloading (presumably thumbnail -> large rendition; confirm).
        # Download first so a failed request leaves no empty file behind.
        res = requests.get(src.replace('113x113', '740x-'))
        path = os.path.join(target_dir, src.rsplit('/', 1)[-1])
        with open(path, 'wb') as f:
            f.write(res.content)
def spider(line):
    """Crawl one pic.yesky.com listing page and download each gallery on it.

    Args:
        line: Page number substituted into the ``6_243_{}.shtml`` URL.

    For every ``<dd>`` in the page's ``div.lb_box`` the first anchor's
    ``href`` and ``title`` are read, ``<BASE_PATH>/b/<title>`` is created,
    and ``worker`` is called to fetch that gallery's images.
    """
    response = requests.get(url='http://pic.yesky.com/c/6_243_{}.shtml'.format(line))
    soup = BeautifulSoup(response.text, 'html.parser')
    div = soup.find(name='div', attrs={'class': "lb_box"})
    if div is None:
        print(response.url, 'no "lb_box" block found, skipped')
        return

    for dd in div.find_all(name='dd'):
        link = dd.find('a')
        if link is None:
            continue
        a_url, title = link.get("href"), link.get("title")
        if not a_url or not title:
            # Without both values there is nothing to fetch or no folder name.
            continue
        path = os.path.join(BASE_PATH, 'b', title)
        # makedirs also creates the missing parent 'b', and exist_ok removes
        # the check-then-create race between pool threads.
        os.makedirs(path, exist_ok=True)
        worker(a_url, title)
def run():
    """Crawl listing pages 1-10 concurrently on a pool of 10 threads.

    Blocks until every page has been processed. A failure in one page is
    reported on stdout and does not stop the remaining pages.
    """
    # The context manager shuts the pool down once all tasks have finished.
    with ThreadPoolExecutor(10) as pool:
        futures = {pool.submit(spider, page): page for page in range(1, 11)}
        for future, page in futures.items():
            try:
                # result() re-raises whatever the task raised; without this
                # call worker exceptions would be dropped silently.
                future.result()
            except Exception as exc:
                print('page {} failed: {!r}'.format(page, exc))
# Script entry point: start the threaded crawl only when run directly.
if __name__ == "__main__":
    run()