Resolución de Problemas con Conexiones HTTP
Los errores como HTTPConnectionPool ocurren cuando las solicitudes demasiado frecuentes agotan el pool de conexiones o provocan el bloqueo de la IP. Para mitigarlo, se recomienda usar proxies y configurar el header Connection: close.
Proxies: Tipos y Niveles de Anonimato
Los proxies actúan como intermediarios entre el cliente y el servidor. Su nivel de anonimato varía: alto (no revela ningún dato del cliente), medio (el servidor detecta que se usa un proxy, pero no ve la IP real) y bajo (expone la IP real). Los protocolos más comunes son HTTP y HTTPS. Algunas fuentes gratuitas incluyen sitios como Goubanjia o 快代理.
Obtención de Cookies desde Navegadores
import os
import sqlite3
import win32crypt
# Read the 'token' cookie stored for example.com in the current user's Chrome
# profile and decrypt its encrypted value with win32crypt.CryptUnprotectData.
user = os.environ.get('USERNAME')
db_path = f'C:/Users/{user}/AppData/Local/Google/Chrome/User Data/Default/Cookies'
query = "SELECT host_key, name, value, encrypted_value FROM cookies WHERE name = 'token' AND host_key = 'example.com';"

connection = sqlite3.connect(db_path)
try:
    cursor = connection.cursor()
    # Column order follows the SELECT above, so row[3] is encrypted_value.
    for row in cursor.execute(query):
        encrypted = row[3]
        if encrypted:
            # The decrypted payload is taken from element 1 of the result
            # and decoded to text.
            decrypted = win32crypt.CryptUnprotectData(encrypted, None, None, None, 0)
            value = bytes.decode(decrypted[1])
except Exception as error:
    # Best-effort demo: report the failure instead of aborting the script.
    print(error)
finally:
    # Release the database handle even when the query or decryption fails.
    connection.close()
Operaciones con Proxies
Para integrar proxies en solicitudes:
import requests
# Send a single GET request through an HTTPS proxy and save the body to disk.
target_url = "https://www.example.com/check-ip"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}
proxy_settings = {"https": "185.199.229.156:7492"}

response = requests.get(target_url, headers=headers, proxies=proxy_settings)

# Persist the decoded response body as UTF-8 text.
with open("result.html", "w", encoding="utf-8") as output:
    output.write(response.text)
Creación de un Pool de Proxies
import random
import requests
# Candidate proxies; each entry is a mapping that can be passed directly as
# the `proxies` argument of requests.get.
proxy_list = [
    {"https": "177.54.143.109:999"},
    {"https": "103.149.162.194:80"},
    {"https": "50.174.7.165:80"},
]
url = "https://api.ipify.org?format=json"
headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT)'}

# Pick one proxy at random for this request.
chosen_proxy = random.choice(proxy_list)
reply = requests.get(url, headers=headers, proxies=chosen_proxy)
data = reply.json()

# The JSON reply carries the address under the 'ip' key.
print("IP detectada:", data['ip'])
Extracción de IPs de Proxy de Fuentes Públicas
import requests
import random
from lxml import html
# Scrape pages 1-4 of a public proxy listing and sort the entries into HTTP
# and non-HTTP buckets. Every entry is a one-item mapping shaped like the
# `proxies` argument of requests (lower-case scheme -> "ip:port").
config_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
    'Connection': 'close'
}
base_url = "https://www.proxy-list.download/HTTP?page=%d"
proxy_http = []
proxy_https = []
for page in range(1, 5):
    full_url = base_url % page
    response = requests.get(full_url, headers=config_headers)
    tree = html.fromstring(response.content)
    # The first <tr> is skipped — presumably the table header; verify against the page.
    rows = tree.xpath('//tr')[1:]
    for row in rows:
        ip_text = row.xpath('.//td[1]/text()')
        port_text = row.xpath('.//td[2]/text()')
        protocol_text = row.xpath('.//td[3]/text()')
        # Rows without all three cells (spacers, ads, nested tables) are
        # skipped instead of raising IndexError.
        if not (ip_text and port_text and protocol_text):
            continue
        ip = ip_text[0].strip()
        port = port_text[0].strip()
        protocol = protocol_text[0].strip().upper()
        # requests selects a proxy by lower-case URL scheme, so the key must be
        # "http"/"https"; an upper-case key would never match.
        entry = {protocol.lower(): f"{ip}:{port}"}
        if protocol == "HTTP":
            proxy_http.append(entry)
        else:
            proxy_https.append(entry)
print(f"Proxies HTTP: {len(proxy_http)}, HTTPS: {len(proxy_https)}")
Manejo de Cookies en Scraping
El uso de Session permite la persistencia automática de cookies:
import requests
from lxml import etree
# Two requests made through the same Session object: cookies received on the
# first one are kept on the session and reused for the second.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0'

initial_url = "https://www.example.com/login"
api_url = "https://www.example.com/api/data"

# First request: cookies are stored on the session automatically.
session.get(initial_url)

# Second request goes through the same session, then its JSON body is printed.
response = session.get(api_url)
print(response.json())
Simulación de Inicio de Sesión con CAPTCHA
import requests
from lxml import etree
import hashlib
class CaptchaSolver:
    """Client for a remote CAPTCHA-recognition HTTP API.

    Only the MD5 hex digest of the password is kept on the instance and sent
    with each request.
    """

    def __init__(self, user, pwd, software_id):
        """Store the account name, the MD5 hex digest of *pwd* and the software id."""
        digest = hashlib.md5(pwd.encode('utf8'))
        self.user = user
        self.pwd_hash = digest.hexdigest()
        self.software_id = software_id

    def solve(self, image_data, code_type):
        """Upload *image_data* and return the ``'text'`` field of the JSON reply.

        Args:
            image_data: Content of the CAPTCHA image, uploaded as ``captcha.jpg``.
            code_type: Value sent as the ``codetype`` form field.

        Returns:
            The value under ``'text'`` in the decoded JSON response, or ``None``
            when that key is absent.
        """
        form_fields = {
            'user': self.user,
            'pass2': self.pwd_hash,
            'softid': self.software_id,
            'codetype': code_type,
        }
        upload = {'userfile': ('captcha.jpg', image_data)}
        reply = requests.post(
            'http://api.captchasolver.net/process',
            data=form_fields,
            files=upload,
        )
        decoded = reply.json()
        return decoded.get('text')
solver = CaptchaSolver('user123', 'pass456', '67890')

# Log in to a site protected by a CAPTCHA.
session = requests.Session()
login_page = session.get("https://auth.example.com/login")
tree = etree.HTML(login_page.text)

# Download the CAPTCHA image through the same session as the login page.
captcha_src = 'https://auth.example.com/' + tree.xpath('//img[@id="captcha"]/@src')[0]
captcha_img = session.get(captcha_src).content
with open("temp_captcha.jpg", "wb") as f:
    f.write(captcha_img)

# Pass the bytes already held in memory: they are exactly what was written to
# temp_captcha.jpg, and re-opening the file without closing it leaked a handle.
captcha_text = solver.solve(captcha_img, 1001)

# The hidden __VIEWSTATE field from the login page is sent back with the form.
view_state = tree.xpath('//input[@name="__VIEWSTATE"]/@value')[0]
form_data = {
    "__VIEWSTATE": view_state,
    "username": "testuser",
    "password": "testpass",
    "captcha": captcha_text
}
login_response = session.post("https://auth.example.com/submit", data=form_data)
print("Estado de login:", login_response.status_code)
Reconocimiento de CAPTCHAs
Servicios como Chaojiying o Anti-Captcha ofrecen APIs para resolver CAPTCHAs automáticamente, con diferentes tipos de código soportados.
Parámetros Dinámicos en Solicitudes
A menudo, los parámetros variables (como tokens CSRF) se ocultan en el código fuente HTML y deben extraerse antes de realizar solicitudes POST.
Concurrencia con Hilos usando multiprocessing.dummy
import time
import requests
from multiprocessing.dummy import Pool
def fetch(url):
    """Return the body text of a GET request to *url*."""
    reply = requests.get(url)
    return reply.text


start_time = time.time()

# NOTE(review): the Flask test server shown in this document registers
# /data1 and /data2 — confirm that these /endpointN routes exist.
urls = [
    "http://localhost:5000/endpoint1",
    "http://localhost:5000/endpoint2",
    "http://localhost:5000/endpoint3",
]

# multiprocessing.dummy.Pool has the Pool API but is backed by threads;
# three workers, one per URL.
with Pool(3) as pool:
    results = pool.map(fetch, urls)

elapsed = time.time() - start_time
print("Tiempo total:", elapsed)
Servidor Flask para Pruebas
from flask import Flask
import time
app = Flask(__name__)


def _slow_reply(text):
    """Sleep for two seconds, then return *text* as the response body."""
    time.sleep(2)
    return text


@app.route('/data1')
def data1():
    """Answer ``"Datos 1"`` after a two-second delay."""
    return _slow_reply("Datos 1")


@app.route('/data2')
def data2():
    """Answer ``"Datos 2"`` after a two-second delay."""
    return _slow_reply("Datos 2")


if __name__ == '__main__':
    # threaded=True asks the development server to handle requests
    # concurrently, so the two-second sleeps of separate requests can overlap.
    app.run(threaded=True)
Corrutinas y Tareas con asyncio
import asyncio
async def delayed_print(message, delay=1):
    """Wait *delay* seconds, print *message* and return it.

    Args:
        message: Value to print and to return.
        delay: Seconds to sleep before printing; defaults to 1.

    Returns:
        The same *message* that was passed in.
    """
    await asyncio.sleep(delay)
    print(message)
    return message


async def main():
    """Run ``delayed_print`` as a task and report its result from a done-callback."""
    # The task is created inside the running loop. Creating it at module level
    # relied on get_event_loop() implicitly creating a loop, which is
    # deprecated and raises RuntimeError on recent Python versions.
    task = asyncio.ensure_future(delayed_print("Hello from coroutine"))
    # The callback receives the finished task; result() is the coroutine's
    # return value.
    task.add_done_callback(lambda t: print("Callback:", t.result()))
    await task


if __name__ == '__main__':
    # asyncio.run creates the loop, runs main() to completion and closes it.
    asyncio.run(main())
Multi-tarea Asincrónica
import asyncio
import time
async def process_url(url, delay=2):
    """Simulate handling *url*: wait *delay* seconds, then print a notice.

    Args:
        url: Label printed once the wait is over.
        delay: Seconds to sleep before printing; defaults to 2.
    """
    await asyncio.sleep(delay)
    print(f"Procesado: {url}")


urls = ["url1.com", "url2.com"]


async def main():
    """Process every entry of ``urls`` concurrently and print the elapsed time."""
    start = time.time()
    # gather schedules all coroutines on the running loop, so the sleeps
    # overlap instead of running one after another.
    await asyncio.gather(*(process_url(url) for url in urls))
    print("Duración:", time.time() - start)


if __name__ == '__main__':
    # Replaces module-level ensure_future()/get_event_loop(), which depended on
    # a loop being created implicitly (deprecated; an error on recent Python).
    asyncio.run(main())
Scraping Asincrónico con aiohttp
import aiohttp
import asyncio
import time
async def fetch_async(url):
    """Fetch *url* with aiohttp and return the response body as text.

    A new ``ClientSession`` is opened for the request and closed when the
    ``async with`` block exits.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


urls = [
    "http://localhost:5000/data1",
    "http://localhost:5000/data2"
]


async def main():
    """Download every entry of ``urls`` concurrently and print a preview of each."""
    start = time.time()
    # gather returns the results in the same order as ``urls``.
    responses = await asyncio.gather(*(fetch_async(url) for url in urls))
    for resp in responses:
        print("Respuesta:", resp[:100])  # First 100 characters
    print("Tiempo total:", time.time() - start)


if __name__ == '__main__':
    # Replaces module-level get_event_loop()/ensure_future(), which depended on
    # a loop being created implicitly (deprecated; an error on recent Python).
    asyncio.run(main())