Python Crawler Notes

Published: 2019-05-06 22:18:13 · Editor: auto · Reads: 2404

    # -*- coding: utf8 -*-
    import requests
    from bs4 import BeautifulSoup
    import time
    import os
    import urllib
    import re
    import json


    # Suppress the InsecureRequestWarning caused by verify=False requests.
    requests.packages.urllib3.disable_warnings()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    # Proxy addresses were redacted in the original post.
    proxies = {"http": "**********************",
               "https": "********************8"}


    def get_bs(url):
        """Fetch a page through the proxy and return a parsed BeautifulSoup tree."""
        res = requests.get(url, proxies=proxies, headers=headers, verify=False)
        bs = BeautifulSoup(res.content, 'lxml')
        return bs

    def get_first_url():
        """Collect the detail-page URLs from the paginated listing."""
        first_url_list = []
        page = 1  # number of listing pages to crawl
        for i in range(page):
            root_url = "https://www.model61.com/mold.php?page={}".format(str(i + 1))
            bs = get_bs(root_url)
            for a in bs.select("dt a"):
                src = a.get('href')
                if "php" in src:
                    first_url = "https://www.model61.com/{}".format(src)
                    first_url_list.append(first_url)
        return first_url_list

    def get_second_url(first_url):
        """From a detail page, pull the album link and the identity text block."""
        data = {}
        bs = get_bs(first_url)
        for a in bs.select(".cont-top a"):
            src = a.get('href')
            if "album_s" in src:
                second_url = "https://www.model61.com/{}".format(src)
                data["second_url"] = second_url

        for j in bs.select(".content_center_date"):
            data["identity"] = j.get_text()
        return data


    def get_thred_url(second_url):
        """From the album page, return the link to the image list page."""
        bs = get_bs(second_url)
        for a in bs.select("dt a"):
            src = a.get('href')
            if "album_list" in src:
                thred_url = "https://www.model61.com/{}".format(src)
                return thred_url


    def get_image_list(thred_url):
        """Collect the image URLs from both columns of the album list page."""
        image_list = []
        bs = get_bs(thred_url)
        for a in bs.select(".album_list_left a") + bs.select(".album_list_right a"):
            src = a.get('href')
            image_path = "https://www.model61.com/{}".format(src)
            image_list.append(image_path)
        return image_list

    def download_image(image_path, image_url):
        """Download one image to disk; failures are logged and skipped."""
        try:
            r = requests.get(image_url, proxies=proxies, headers=headers, verify=False, allow_redirects=False)
            with open(image_path, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            print(e)

    def create_face_id(data):
        """Build an identity folder from the parsed text and download its images."""
        save_path = r""  # left empty in the original post; set to a local save directory
        identity = data["identity"]
        ld_list = identity.split("\n")
        identity = ld_list[1] + '_' + ld_list[3][4:] + "_" + ld_list[7][6:] + '_' + ld_list[8][4:]
        print(identity)
        identity_path = os.path.join(save_path, identity)
        if not os.path.exists(identity_path):
            os.mkdir(identity_path)
        for image_url in data['image_list']:
            # Millisecond timestamp as a (mostly) unique file name.
            image_path = os.path.join(identity_path, '{}.jpg'.format(str(int(time.time() * 1000))))
            download_image(image_path, image_url)


    if __name__ == '__main__':
        first_url_list = get_first_url()
        for first_url in first_url_list:
            try:
                data = get_second_url(first_url)
                print(data)
                second_url = data['second_url']
                thred_url = get_thred_url(second_url)
                image_list = get_image_list(thred_url)
                data["image_list"] = image_list
                create_face_id(data)
            except Exception as e:
                print(first_url, e)
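When running a crawler like this against a single site, it usually helps to space out requests and retry transient network failures instead of letting one bad response kill the whole run. Below is a minimal sketch of such a fetch wrapper; it is not part of the original script, and the function name `get_bs_with_retry` and its parameters are illustrative. It assumes the same `requests`/`BeautifulSoup` stack used above.

    # Sketch only: a fetch helper with a delay and simple retry loop.
    import time

    import requests
    from bs4 import BeautifulSoup


    def get_bs_with_retry(url, retries=3, delay=1.0, **request_kwargs):
        """Fetch a page and parse it with lxml, retrying on network errors
        and sleeping between attempts."""
        for attempt in range(1, retries + 1):
            try:
                res = requests.get(url, timeout=10, **request_kwargs)
                res.raise_for_status()
                return BeautifulSoup(res.content, 'lxml')
            except requests.RequestException as e:
                print("attempt {}/{} failed for {}: {}".format(attempt, retries, url, e))
                time.sleep(delay)  # back off before the next attempt
        return None

A caller could pass the same proxy and header settings through `request_kwargs`, e.g. `get_bs_with_retry(url, proxies=proxies, headers=headers, verify=False)`.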



