Técnicas Avanzadas de Web Scraping en Python: Manejo de Proxies, Cookies, CAPTCHAs y Concurrencia Asincrónica

Resolución de Problemas con Conexiones HTTP

Los errores como HTTPConnectionPool ocurren cuando las solicitudes demasiado frecuentes agotan el pool de conexiones o provocan el bloqueo de la IP. Para mitigarlo, se recomienda usar proxies y configurar el header Connection: close.

Proxies: Tipos y Niveles de Anonimato

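Los proxies actúan como intermediarios entre el cliente y el servidor de destino. Su nivel de anonimato varía: alto (el servidor no detecta ni la IP real ni el uso del proxy), medio (el servidor detecta que se usa un proxy, pero no ve la IP real) y bajo (el proxy expone la IP real). Los protocolos comunes son HTTP y HTTPS. Algunas fuentes gratuitas incluyen sitios como Goubanjia o 快代理.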

Obtención de Cookies desde Navegadores

import os
import sqlite3
import win32crypt

# Read the 'token' cookie stored for example.com in the local Chrome profile
# and decrypt its encrypted value through win32crypt.
# NOTE(review): newer Chrome releases reportedly changed both the cookie
# database location and the encryption scheme -- confirm this path and this
# decryption call still apply to the Chrome version being targeted.
user = os.environ.get('USERNAME')
db_path = f'C:/Users/{user}/AppData/Local/Google/Chrome/User Data/Default/Cookies'
query = "SELECT host_key, name, value, encrypted_value FROM cookies WHERE name = 'token' AND host_key = 'example.com';"

value = None  # stays None when no matching cookie could be decrypted
connection = None
try:
    # Connecting inside the try block lets an unreadable database be
    # reported the same way as any other failure.
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    cursor.execute(query)
    for row in cursor:
        encrypted = row[3]  # encrypted_value column
        if encrypted:
            # Index 1 of the returned tuple holds the decrypted bytes.
            decrypted = win32crypt.CryptUnprotectData(encrypted, None, None, None, 0)
            value = decrypted[1].decode()
except Exception as error:
    # Best effort: print the problem instead of aborting the script.
    print(error)
finally:
    # Always release the SQLite handle, even if reading or decrypting failed.
    if connection is not None:
        connection.close()

Operaciones con Proxies

Para integrar proxies en solicitudes:

import requests

# Fetch a page through one proxy and save the response body to result.html.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
target_url = "https://www.example.com/check-ip"
# The Requests documentation states that proxy URLs must include the scheme.
proxies = {"https": "http://185.199.229.156:7492"}
# Without a timeout a dead or slow proxy would block the script indefinitely.
response = requests.get(target_url, headers=headers, proxies=proxies, timeout=10)
with open("result.html", "w", encoding="utf-8") as file:
    file.write(response.text)

Creación de un Pool de Proxies

import random
import requests

# Pick one proxy at random from a small pool and ask api.ipify.org which IP
# address the request appears to come from.
# The Requests documentation states that proxy URLs must include the scheme.
proxy_list = [
    {"https": "http://177.54.143.109:999"},
    {"https": "http://103.149.162.194:80"},
    {"https": "http://50.174.7.165:80"},
]
headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT)'}
url = "https://api.ipify.org?format=json"
chosen_proxy = random.choice(proxy_list)
# Without a timeout an unresponsive proxy would hang the script forever.
data = requests.get(url, headers=headers, proxies=chosen_proxy, timeout=10).json()
print("IP detectada:", data['ip'])

Extracción de IPs de Proxy de Fuentes Públicas

import requests
import random
from lxml import html

# Scrape pages 1-4 of a public proxy listing and sort the entries into two
# lists of Requests-style proxy dictionaries (one for HTTP, one for HTTPS).
config_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
    'Connection': 'close'
}
base_url = "https://www.proxy-list.download/HTTP?page=%d"
proxy_http = []
proxy_https = []

for page in range(1, 5):
    full_url = base_url % page
    # A timeout keeps one unresponsive page from stalling the whole loop.
    response = requests.get(full_url, headers=config_headers, timeout=10)
    tree = html.fromstring(response.content)
    # The first <tr> is skipped, as in the original (presumably a header row).
    for row in tree.xpath('//tr')[1:]:
        cells = [row.xpath(f'.//td[{index}]/text()') for index in (1, 2, 3)]
        if not all(cells):
            # Rows lacking an IP, port or protocol cell cannot form an entry.
            continue
        ip, port, protocol = (cell[0].strip() for cell in cells)
        # Requests looks proxies up by lower-case URL scheme, so the key
        # must be "http"/"https" rather than the upper-case table text.
        scheme = protocol.lower()
        entry = {scheme: f"http://{ip}:{port}"}
        if scheme == "http":
            proxy_http.append(entry)
        elif scheme == "https":
            proxy_https.append(entry)
        # Any other protocol is ignored instead of being filed as HTTPS.

print(f"Proxies HTTP: {len(proxy_http)}, HTTPS: {len(proxy_https)}")

Manejo de Cookies en Scraping

El uso de Session permite la persistencia automática de cookies:

import requests
from lxml import etree

# Use a single requests.Session for both calls so that cookies received from
# the first response are sent along with the second request.
initial_url = "https://www.example.com/login"
api_url = "https://www.example.com/api/data"

session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0'

# Visiting the login page first lets the session store the cookies it sets.
session.get(initial_url)

response = session.get(api_url)
payload = response.json()
print(payload)

Simulación de Inicio de Sesión con CAPTCHA

import requests
from lxml import etree
import hashlib

class CaptchaSolver:
    """Client for a remote CAPTCHA-recognition HTTP API.

    The account password is kept only as its MD5 hex digest, which is the
    value sent in the ``pass2`` form field.
    """

    def __init__(self, user, pwd, software_id):
        """Store the account name, the hashed password and the software id."""
        self.user = user
        digest = hashlib.md5(pwd.encode('utf8'))
        self.pwd_hash = digest.hexdigest()
        self.software_id = software_id

    def solve(self, image_data, code_type):
        """Upload a CAPTCHA image and return the recognised text.

        Args:
            image_data: Image content, uploaded as ``captcha.jpg``.
            code_type: CAPTCHA type code sent in the ``codetype`` field.

        Returns:
            The ``text`` field of the JSON reply, or ``None`` when the reply
            has no such field.
        """
        form_fields = dict(
            user=self.user,
            pass2=self.pwd_hash,
            softid=self.software_id,
            codetype=code_type,
        )
        upload = {'userfile': ('captcha.jpg', image_data)}
        reply = requests.post('http://api.captchasolver.net/process', data=form_fields, files=upload)
        answer = reply.json()
        return answer.get('text')

from urllib.parse import urljoin

solver = CaptchaSolver('user123', 'pass456', '67890')

# Log in to a site whose form is protected by a CAPTCHA: download the image,
# have the solver read it, then submit the form within the same session.
session = requests.Session()
login_page = session.get("https://auth.example.com/login")
tree = etree.HTML(login_page.text)

# urljoin resolves relative, root-relative and absolute src values alike,
# which plain string concatenation does not.
captcha_src = urljoin(login_page.url, tree.xpath('//img[@id="captcha"]/@src')[0])
captcha_img = session.get(captcha_src).content

# Keep a copy on disk for inspection.
with open("temp_captcha.jpg", "wb") as f:
    f.write(captcha_img)

# The image bytes are already in memory; re-opening the file would only
# leak a file handle.
captcha_text = solver.solve(captcha_img, 1001)

# The hidden __VIEWSTATE field must be posted back together with the form.
view_state = tree.xpath('//input[@name="__VIEWSTATE"]/@value')[0]
form_data = {
    "__VIEWSTATE": view_state,
    "username": "testuser",
    "password": "testpass",
    "captcha": captcha_text
}
login_response = session.post("https://auth.example.com/submit", data=form_data)
print("Estado de login:", login_response.status_code)

Reconocimiento de CAPTCHAs

Servicios como Chaojiying o Anti-Captcha ofrecen APIs para resolver CAPTCHAs automáticamente, con diferentes tipos de código soportados.

Parámetros Dinámicos en Solicitudes

A menudo, los parámetros variables (como tokens CSRF) se ocultan en el código fuente HTML y deben extraerse antes de realizar solicitudes POST.

Concurrencia con Hilos usando multiprocessing.dummy

import time
import requests
from multiprocessing.dummy import Pool

# Download several URLs concurrently with a thread pool
# (multiprocessing.dummy.Pool) and report the total elapsed time.
# The URLs point at the routes served by the Flask test server (/data1 and
# /data2), matching the aiohttp example.
urls = [
    "http://localhost:5000/data1",
    "http://localhost:5000/data2",
]


def fetch(url):
    """Return the response body of ``url`` as text.

    The timeout keeps a stalled request from blocking its worker thread,
    and therefore ``pool.map``, indefinitely.
    """
    return requests.get(url, timeout=10).text


start_time = time.time()
# One worker per URL so that all requests are in flight at the same time.
with Pool(processes=len(urls)) as pool:
    results = pool.map(fetch, urls)

print("Tiempo total:", time.time() - start_time)

Servidor Flask para Pruebas

from flask import Flask
import time

# Minimal Flask application with two deliberately slow routes, used as a
# local target for the concurrency examples.
app = Flask(__name__)


def _slow_reply(body):
    """Wait two seconds, then return ``body`` unchanged."""
    time.sleep(2)
    return body


@app.route('/data1')
def data1():
    """Serve the first slow test payload."""
    return _slow_reply("Datos 1")


@app.route('/data2')
def data2():
    """Serve the second slow test payload."""
    return _slow_reply("Datos 2")


if __name__ == '__main__':
    # threaded=True is passed so requests can be handled concurrently.
    app.run(threaded=True)

Corrutinas y Tareas con asyncio

import asyncio

async def delayed_print(message):
    """Wait one second, print ``message`` and return it."""
    await asyncio.sleep(1)
    print(message)
    return message


async def run_demo():
    """Wrap the coroutine in a task, attach a done-callback and await it."""
    # The task is created inside a running event loop; creating it at module
    # level relies on implicit loop creation, which asyncio has deprecated.
    task = asyncio.ensure_future(delayed_print("Hello from coroutine"))
    task.add_done_callback(lambda t: print("Callback:", t.result()))
    await task


# asyncio.run creates the event loop, runs the coroutine and closes the loop.
asyncio.run(run_demo())

Multi-tarea Asincrónica

import asyncio
import time

async def process_url(url):
    """Simulate two seconds of I/O for ``url``, then report it as processed."""
    await asyncio.sleep(2)
    print(f"Procesado: {url}")


async def process_all(targets):
    """Run ``process_url`` for every target concurrently."""
    await asyncio.gather(*(process_url(target) for target in targets))


urls = ["url1.com", "url2.com"]

start = time.time()
# asyncio.run supplies the event loop; creating tasks at module level relies
# on implicit loop creation, which asyncio has deprecated.
asyncio.run(process_all(urls))
print("Duración:", time.time() - start)

Scraping Asincrónico con aiohttp

import aiohttp
import asyncio
import time

async def fetch_async(url, session=None):
    """Return the body of ``url`` as text.

    Args:
        url: Address to request with HTTP GET.
        session: Optional ``aiohttp.ClientSession`` to reuse. When omitted,
            a temporary session is opened for this one request, which is
            what the function did before the parameter existed.

    Returns:
        The decoded response body.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await fetch_async(url, own_session)
    async with session.get(url) as response:
        return await response.text()


async def fetch_all(targets):
    """Fetch every target concurrently through one shared session."""
    # Sharing a single ClientSession lets the requests reuse its connection
    # pool instead of building a new session per URL.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(fetch_async(target, session) for target in targets)
        )


urls = [
    "http://localhost:5000/data1",
    "http://localhost:5000/data2"
]

start = time.time()
# asyncio.run supplies the event loop; creating tasks at module level relies
# on implicit loop creation, which asyncio has deprecated.
responses = asyncio.run(fetch_all(urls))

for resp in responses:
    print("Respuesta:", resp[:100])  # first 100 characters
print("Tiempo total:", time.time() - start)

Etiquetas: Python requests proxies Cookies web-scraping

Publicado el 7-2 22:50