
2023-07-13 16:09:49 浏览数 (1)





1. 导入必要的库和模块


import pickle
import logging
from datetime import datetime
from dateutil.parser import parse as parse_date
from brisque import BRISQUE
import os
import cv2
import numpy as np
from PIL import Image
from io import BytesIO
import os
import requests
from skimage import color
from time import sleep
from random import choice
import concurrent.futures
from requests.exceptions import Timeout
from robots import RobotParser

from headers import HEADERS

# --- CRAWLER SETTINGS ---

# Number of times the crawler should retry a URL
MAX_RETRIES = 3
# Initial backoff delay in seconds
INITIAL_BACKOFF = 2
# Default sleep time in seconds after a 429 error
DEFAULT_SLEEP = 10

# Single BRISQUE scorer instance shared by the whole module.
# NOTE(review): url=False presumably means images are supplied as in-memory
# arrays rather than as URLs - confirm against the brisque package docs.
brisque = BRISQUE(url=False)

2. 设置日志记录器


# --- SETUP LOGGER ---

filename = 'image-scraper.log'
filepath = os.path.dirname(os.path.abspath(__file__))

# create file path for log file (next to this script, independent of the
# current working directory)
log_file = os.path.join(filepath, filename)

# create a FileHandler to log messages to the log file
handler = logging.FileHandler(log_file)
# set the log message format: level, thread name, timestamp, message
formatter = logging.Formatter(
    '%(levelname)s %(threadName)s (%(asctime)s): %(message)s')
handler.setFormatter(formatter)
# create a logger with the given name and log level; INFO is required
# because the scraper reports successful fetches with logger.info()
logger = logging.getLogger('image-scraper')
logger.setLevel(logging.INFO)
# prevent logging from being send to the upper logger - that includes the console logging
logger.propagate = False
# add the FileHandler to the logger
logger.addHandler(handler)

3. 定义计算图片质量指标的函数


  • 计算亮度:我们将图片转换为灰度图,并计算其像素值的平均值。
  • 计算清晰度:我们使用拉普拉斯算子对灰度图进行边缘检测,并计算其方差值。
  • 计算对比度:我们使用均方根对比度的公式,计算灰度图像素值与其平均值的差的平方的平均值的平方根。
  • 计算噪声:我们直接计算图片所有像素值的方差,作为噪声水平的粗略估计(更精确的做法是使用高斯滤波残差或中值绝对偏差(MAD))。
  • 计算饱和度:我们将图片转换为HSV颜色空间,并计算其饱和度通道的平均值。
  • 计算色彩度:我们将图片转换为LAB颜色空间,并计算其a和b通道的平方和的平方根的平均值。
  • 获取图片的尺寸:我们获取图片的高度和宽度,并将其添加到字典中。
def get_image_quality_metrics(response):
    """
    Calculate various image quality metrics for an image.

    Args:
        response (requests.Response): The response object containing the image data.

    Returns:
        dict: A dict of image quality metrics with the keys 'brightness',
            'sharpness', 'contrast', 'noise', 'saturation', 'colorfulness',
            'height' and 'width'.

    Raises:
        ValueError: If the response body is empty or cannot be decoded as an image.
    """
    image_array = np.frombuffer(response.content, np.uint8)
    if image_array.size == 0:
        raise ValueError("response content is empty")

    # cv2.imdecode returns None (instead of raising) for undecodable data,
    # and yields pixels in BGR channel order
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError("response content could not be decoded as an image")

    metrics = dict()

    # Calculate brightness as the mean gray level
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    metrics['brightness'] = np.mean(gray)

    # Calculate sharpness using variance of Laplacian
    metrics['sharpness'] = cv2.Laplacian(gray, cv2.CV_64F).var()

    # Calculate contrast using root mean squared contrast
    metrics['contrast'] = np.sqrt(np.mean((gray - np.mean(gray)) ** 2))

    # Crude noise proxy: variance over all pixel values of all channels
    metrics['noise'] = np.var(image)

    # The skimage colour conversions expect RGB input, so reorder the
    # channels of the BGR array produced by OpenCV first
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Calculate saturation as the mean of the HSV saturation channel
    hsv = color.rgb2hsv(rgb)
    saturation = hsv[:, :, 1]
    metrics['saturation'] = np.mean(saturation)

    # Calculate colorfulness as the mean chroma magnitude in LAB space
    lab = color.rgb2lab(rgb)
    a, b = lab[:, :, 1], lab[:, :, 2]
    metrics['colorfulness'] = np.sqrt(np.mean(a ** 2 + b ** 2))

    # Get dimensions of the image
    height, width = image.shape[:2]
    metrics['height'] = height
    metrics['width'] = width

    return metrics

4. 定义发送请求的函数


  • 我们使用requests库提供的方法来创建一个代理服务器对象,使用亿牛云提供的代理服务器信息。
  • 我们使用一个while循环来重试请求,设置一个最大重试次数和一个初始退避延迟时间。
  • 我们从headers模块中随机选择一个请求头部,并将其添加到请求中。
  • 我们使用try-except语句来捕获可能出现的异常和错误,并根据不同的情况进行处理:
    • 如果出现超时错误,我们记录日志信息,并增加重试次数和退避延迟时间。
    • 如果出现状态码不为200的错误,我们记录日志信息,并根据状态码进行处理:
      • 如果状态码为429,表示请求过于频繁,我们需要等待一段时间后再重试,我们可以使用time模块提供的sleep方法来暂停程序运行,并设置一个默认的睡眠时间。
      • 如果状态码为403或404,表示请求被拒绝或资源不存在,我们可以直接跳出循环并返回None,不再重试。
      • 如果状态码为其他值,表示请求出现其他错误,我们可以直接抛出异常,并记录日志信息。
    • 如果没有出现异常或错误,我们返回响应对象,并记录日志信息。
def send_request(url: str) -> requests.Response:
    """
    Sends a GET request to the specified URL, checks whether the link is valid,
    and returns a response object.

    Timeouts and 429 responses are retried up to MAX_RETRIES times; 403 and
    404 responses are skipped immediately.

    Args:
        url (str): The URL to send the GET request to

    Returns:
        requests.Response: The successful response, or None if the URL was
            skipped (403/404) or all retries were used up.

    Raises:
        requests.exceptions.HTTPError: For any HTTP error status other than
            429, 403 and 404.
        requests.exceptions.RequestException: Other request failures (for
            example connection errors) are not handled here and propagate.
    """
    retry_count = 0
    backoff = INITIAL_BACKOFF
    header = choice(HEADERS)

    # Proxy host and port
    proxy_host = "www.16yun.cn"
    proxy_port = "31111"

    # Proxy credentials
    # NOTE(review): these look like placeholder values - replace them with
    # real credentials loaded from configuration, not from source code.
    proxy_user = "16YUN"
    proxy_pass = "16IP"

    # Route both plain and TLS traffic through the same authenticated proxy
    proxy_meta = f"http://{proxy_user}:{proxy_pass}@{proxy_host}:{proxy_port}"
    proxies = {
        "http": proxy_meta,
        "https": proxy_meta,
    }

    while retry_count < MAX_RETRIES:
        try:
            # Send a GET request to the website and return the response object
            req = requests.get(url, headers=header, proxies=proxies, timeout=20)
            # requests does not raise on 4xx/5xx by itself; without this call
            # the HTTPError handler below could never run
            req.raise_for_status()
            logger.info(f"Successfully fetched {url}")
            return req

        except Timeout:
            # Handle timeout error: log the error and increase the retry count and backoff delay
            logger.error(f"Timeout error for {url}")
            retry_count += 1
            if retry_count < MAX_RETRIES:
                # Only wait when another attempt will actually follow
                sleep(backoff)
                backoff *= 2

        except requests.exceptions.HTTPError as e:
            # Handle HTTP error: log the error and check the status code
            logger.error(f"HTTP error for {url}: {e}")
            status_code = e.response.status_code

            if status_code == 429:
                # Handle 429 error: wait for some time and retry
                retry_count += 1
                if retry_count < MAX_RETRIES:
                    logger.info(f"Waiting for {DEFAULT_SLEEP} seconds after 429 error")
                    sleep(DEFAULT_SLEEP)

            elif status_code == 403 or status_code == 404:
                # Handle 403 or 404 error: break the loop and return None
                logger.info(f"Skipping {url} due to {status_code} error")
                break

            else:
                # Handle other errors: raise the exception and log the error
                logger.error(f"Other HTTP error for {url}: {e}")
                raise

    # Return None if the loop ends without returning a response object
    return None

5. 定义处理图片的函数


  • 我们使用PIL库提供的方法来打开响应对象中的图片数据,并将其转换为RGBA格式。
  • 我们使用os模块提供的方法来创建一个名为“images”的文件夹,用于存储下载的图片。
  • 我们使用datetime模块提供的方法来获取当前的日期和时间,并将其转换为字符串格式,作为图片的文件名。
  • 我们使用“with”语句来打开一个以日期和时间命名的文件,并将图片数据写入到文件中。
  • 我们使用brisque模块提供的方法来计算图片的BRISQUE分数,并将其添加到字典中。
  • 我们使用前面定义的get_image_quality_metrics函数来计算图片的其他质量指标,并将其添加到字典中。
  • 我们使用“del”语句来删除不再需要的变量,如响应对象、图片对象等。
  • 我们返回包含图片信息的字典。
def process_image(response, url):
    """
    Process an image from a response object and calculate its quality metrics and BRISQUE score.

    The image is also saved as a PNG file in the "images" folder.

    Args:
        response (requests.Response): The response object containing the image data.
        url (str): The URL of the image.

    Returns:
        dict: A dict of image information including the source URL, the
            BRISQUE score and the quality metrics.
    """
    # Open the image data from the response object and convert it to RGBA format
    image = Image.open(BytesIO(response.content)).convert('RGBA')

    # Create a folder named "images" to store the downloaded images
    os.makedirs('images', exist_ok=True)

    # Use the current date and time as the file name; microseconds are
    # included so that images saved within the same second do not
    # overwrite each other
    date_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')

    # Open a file with the date and time as the file name and write the image data to it
    with open(f'images/{date_time}.png', 'wb') as f:
        image.save(f, 'PNG')

    image_info = dict()
    # Record the source URL so each result can be traced back to its image
    image_info['url'] = url

    # Calculate the BRISQUE score with the module-level scorer; the alpha
    # channel is dropped first so the scorer receives a plain RGB array
    image_info['brisque'] = brisque.score(np.asarray(image.convert('RGB')))

    # Calculate the other quality metrics of the image and add them to the dict
    image_info.update(get_image_quality_metrics(response))

    # Return the dict of image information
    return image_info

6. 使用线程池来处理多个网站的图片抓取任务


  • 我们创建一个名为“websites”的列表,用于存储需要抓取图片的网站的URL。
  • 我们创建一个名为“results”的列表,用于存储每个网站的图片抓取结果。
  • 我们使用“with”语句来创建一个线程池对象,并设置其最大线程数为10。
  • 我们遍历每个网站的URL,并使用submit方法来提交一个图片抓取任务,传入send_request函数和URL作为参数,并将返回的future对象添加到results列表中。
  • 我们遍历results列表中的每个future对象,并使用result方法来获取其结果,即响应对象。
  • 我们判断响应对象是否为None,如果不为None,表示请求成功,我们则使用process_image函数来处理响应对象,并将返回的图片信息字典添加到results列表中;如果为None,表示请求失败,我们则跳过该网站。
  • 我们使用pickle模块提供的方法来将results列表序列化并保存到一个名为“results.pkl”的文件中。
# Create a list of websites (image URLs) to scrape images from
websites = [
    # Add the image URLs to fetch here, one string per entry
]

# Create a list to store the image information dict of each website
results = []

# Map every submitted future to the URL it fetches, so that each response
# is later processed together with its own URL
futures = {}

# Create a thread pool with 10 threads and submit tasks for each website
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for website in websites:
        # Submit a task to send a request to the website and get a response object
        future = executor.submit(send_request, website)
        futures[future] = website

# All downloads have finished once the pool has exited; process the
# responses one by one in submission order
for future, website in futures.items():
    try:
        # Get the response object from the future object
        response = future.result()
    except requests.exceptions.RequestException as e:
        # A single failed download must not abort the whole run
        logger.error(f"Request for {website} failed: {e}")
        continue

    # Skip the website if the response object is None
    if response is None:
        continue

    try:
        # Process the response object and get the image information dict
        image_info = process_image(response, website)
    except Exception:
        # Top-level boundary: log the traceback and carry on with the next URL
        logger.exception(f"Could not process image from {website}")
        continue

    # Add the image information dict to the results list
    results.append(image_info)

# Serialize and save the results list to a file using pickle module
with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)



0 人点赞