简介:Web 爬虫是获取互联网数据的重要工具。本文将总结 Web 爬虫开发中的最佳实践,包括反爬虫应对策略、数据存储优化、并发处理等高级技巧,帮助开发者构建高效、稳定的爬虫系统。
一、爬虫开发基础原则
- 遵守 robots.txt:尊重网站的爬虫协议
- 控制请求频率:避免对目标网站造成压力
- 设置合理的 User-Agent:模拟真实浏览器访问
- 处理异常情况:网络错误、超时、404 等
- 数据验证:确保获取的数据格式正确
二、请求优化
2.1 使用 Session 保持连接
import requests

# Reuse one TCP connection (keep-alive) across several requests via a Session.
# Using the Session as a context manager guarantees the connection pool is
# released even if a request raises — the original manual close() was skipped
# on any exception.
with requests.Session() as session:
    # Default headers attached to every request made through this session.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })
    # Both requests reuse the same underlying connection.
    response1 = session.get('https://example.com/page1')
    response2 = session.get('https://example.com/page2')
2.2 设置超时和重试
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Session whose transport transparently retries transient failures.
session = requests.Session()

# Retry policy: up to 3 attempts with exponential backoff, applied only to
# idempotent methods and to status codes that signal a temporary problem.
retries = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = HTTPAdapter(max_retries=retries)
session.mount("http://", adapter)
session.mount("https://", adapter)

# One request with a hard 10-second timeout; failures are reported, not raised.
try:
    response = session.get('https://example.com', timeout=10)
except requests.exceptions.Timeout:
    print("请求超时")
except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")
else:
    print(response.text)
2.3 使用连接池
from requests.adapters import HTTPAdapter

# Configure an HTTP transport with an explicit connection pool.
session = requests.Session()
# NOTE(review): per urllib3 semantics, pool_connections is the number of
# per-host connection pools to cache — NOT the pool size as the original
# Chinese comment claimed; pool_maxsize is the cap per host pool.
adapter = HTTPAdapter(
    pool_connections=10,   # number of host connection pools to cache
    pool_maxsize=100,      # max connections kept alive per host pool
    max_retries=3          # transport-level retry count
)
session.mount('http://', adapter)
session.mount('https://', adapter)
三、反爬虫应对策略
3.1 User-Agent 轮换
import random

# Pool of desktop and mobile browser identities to rotate between requests.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15',
]


def get_random_user_agent():
    """Pick one User-Agent string uniformly at random from USER_AGENTS."""
    return random.choice(USER_AGENTS)
# Example: attach a freshly rotated identity to a single request.
headers = {'User-Agent': get_random_user_agent()}
response = requests.get('https://example.com', headers=headers)
3.2 IP 代理池
import random

import requests

# Rotating pool of outbound proxies.
# NOTE(review): each entry only maps the 'http' scheme, so https:// URLs
# will bypass these proxies unless an 'https' key is added — confirm intended.
PROXIES = [
    {'http': 'http://proxy1.example.com:8080'},
    {'http': 'http://proxy2.example.com:8080'},
    {'http': 'http://proxy3.example.com:8080'}
]


def get_random_proxy():
    """Return one proxy mapping chosen uniformly at random from PROXIES.

    Fix: the original snippet called random.choice without importing
    `random` — it only worked by leaning on an earlier example's import.
    """
    return random.choice(PROXIES)
# Route a single request through a randomly selected proxy.
proxy = get_random_proxy()
response = requests.get(
    'https://example.com',
    proxies=proxy,
    timeout=10,
)
3.3 请求延迟
import time
import random


def random_delay(min_delay=1, max_delay=3):
    """Sleep for a random duration in [min_delay, max_delay] seconds."""
    time.sleep(random.uniform(min_delay, max_delay))
# Usage example: space out requests in a crawl loop.
# NOTE(review): assumes `urls` is defined elsewhere in the script.
for url in urls:
    response = requests.get(url)
    # process the response here, then pause
    random_delay()  # random 1-3 second delay between requests
3.4 Cookie 处理
import requests

# A Session transparently stores and re-sends cookies between requests.
session = requests.Session()

# Log in once; the server's Set-Cookie headers are captured by the session.
login_data = {
    'username': 'your_username',
    'password': 'your_password',
}
session.post('https://example.com/login', data=login_data)

# The stored cookies are attached automatically on follow-up requests.
response = session.get('https://example.com/protected_page')

# Cookies can also be supplied explicitly for a one-off request.
cookies = {
    'session_id': 'abc123',
    'user_token': 'xyz789',
}
response = requests.get('https://example.com', cookies=cookies)
四、数据解析
4.1 使用 BeautifulSoup
from bs4 import BeautifulSoup
import requests

# Fetch and parse a page with BeautifulSoup.
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'html.parser')

# Guard against a missing <h1>: find() returns None when there is no match,
# and the original unconditional `.text` would raise AttributeError.
h1 = soup.find('h1')
title = h1.text if h1 is not None else ''
links = soup.find_all('a', class_='link')

# Emit each link's target and label.
for link in links:
    print(link.get('href'), link.text)
4.2 使用 XPath
from lxml import html
import requests

# Fetch the page and build an lxml document tree.
response = requests.get('https://example.com')
tree = html.fromstring(response.text)

# xpath() returns a list; guard the [0] so a page without an <h1>
# does not raise IndexError (the original indexed unconditionally).
titles = tree.xpath('//h1/text()')
title = titles[0] if titles else ''

# All href values of anchors carrying class="link".
links = tree.xpath('//a[@class="link"]/@href')
for link in links:
    print(link)
4.3 使用正则表达式
import re

# NOTE(review): `text` is assumed to be defined by the surrounding crawl code.

# Email addresses.
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
emails = re.findall(email_pattern, text)

# HTTP(S) or www-prefixed URLs.
url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
urls = re.findall(url_pattern, text)

# Mainland-China mobile numbers. The lookarounds stop the pattern from
# matching an 11-digit window inside a longer digit run (the original
# r'1[3-9]\d{9}' would match inside e.g. an 18-digit ID number).
phone_pattern = r'(?<!\d)1[3-9]\d{9}(?!\d)'
phones = re.findall(phone_pattern, text)
五、数据存储
5.1 存储到 CSV
import csv

# Dump the scraped records to a UTF-8 CSV file with a header row.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['标题', '链接', '日期'])
    writer.writerows(
        [item['title'], item['url'], item['date']] for item in data
    )
5.2 存储到 JSON
import json

# Persist the scraped data as pretty-printed JSON, keeping non-ASCII intact.
with open('data.json', 'w', encoding='utf-8') as fp:
    json.dump(data, fp, ensure_ascii=False, indent=2)

# Round-trip: load the records back from disk.
with open('data.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)
5.3 存储到数据库
import sqlite3

# Persist scraped articles in SQLite, de-duplicating on URL.
conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()

    # Schema: url is UNIQUE so re-crawled pages are rejected on insert.
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        url TEXT UNIQUE,
        content TEXT,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
    ''')

    # Insert row by row so a duplicate URL only skips that one record
    # instead of aborting the whole batch.
    for item in data:
        try:
            cursor.execute('''
            INSERT INTO articles (title, url, content)
            VALUES (?, ?, ?)
            ''', (item['title'], item['url'], item['content']))
        except sqlite3.IntegrityError:
            print(f"URL 已存在: {item['url']}")

    # Single commit for the whole batch.
    conn.commit()
finally:
    # Fix: the original leaked the connection if any statement raised.
    conn.close()
六、并发处理
6.1 使用多线程
import threading
import queue

# Fan URL downloads out over a fixed pool of worker threads.
task_queue = queue.Queue()
results = []

# Seed the queue before starting any workers.
for url in urls:
    task_queue.put(url)

def worker():
    """Drain the queue, fetching one URL at a time until it is empty.

    Fix: the original `while not task_queue.empty(): task_queue.get()`
    is racy — two threads can both observe one remaining item, and the
    loser's blocking get() then waits forever, hanging the join() below.
    get_nowait() makes queue exhaustion an explicit, safe exit condition.
    """
    while True:
        try:
            url = task_queue.get_nowait()
        except queue.Empty:
            return
        try:
            response = requests.get(url)
            # list.append is atomic under the GIL, so no extra lock needed.
            results.append(response.text)
        except Exception as e:
            print(f"Error fetching {url}: {e}")
        finally:
            task_queue.task_done()

# Start a small, fixed-size pool of workers.
threads = []
for _ in range(5):  # 5 threads
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

# Block until every worker has drained the queue and exited.
for t in threads:
    t.join()
6.2 使用异步请求
import asyncio
import aiohttp

async def fetch(session, url):
    """Download one page body; log and return None on any failure."""
    try:
        async with session.get(url) as response:
            return await response.text()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

async def fetch_all(urls):
    """Fetch every URL concurrently over a single shared ClientSession."""
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# Drive the event loop over a couple of example pages.
urls = ['https://example.com/page1', 'https://example.com/page2']
results = asyncio.run(fetch_all(urls))
七、使用 EasySpider 工具
EasySpider 提供的工具可以辅助爬虫开发:
- Curl 转 Python:快速将浏览器请求转换为 Python 代码
- JSON 格式化:解析和格式化 API 返回的 JSON 数据
- URL 提取:从页面中提取和解析 URL 参数
- 在线加解密:处理加密的请求参数
- 文本对比:对比页面变化
- IP 查询:获取 IP 地理位置信息
开发流程建议:
- 使用浏览器开发者工具分析请求
- 复制 Curl 命令
- 使用 EasySpider 转换为 Python 代码
- 根据需要调整代码
- 测试和优化
八、错误处理和日志
import logging
import time

# Log to a file with timestamps so long crawls can be audited afterwards.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    filename='spider.log'
)
logger = logging.getLogger(__name__)

def fetch_with_retry(url, max_retries=3):
    """Fetch `url`, retrying up to `max_retries` times with exponential backoff.

    Returns the Response on success, or None when the page is a 404 or when
    every attempt failed. Timeouts and transport errors are retried; a 404
    is treated as permanent and aborts immediately.

    Fix: the original snippet called time.sleep without importing `time`
    (it relied on an earlier example's import). Logging now uses lazy
    %-style arguments per stdlib convention; rendered messages are unchanged.
    """
    for attempt in range(max_retries):
        try:
            logger.info("Fetching %s (attempt %d)", url, attempt + 1)
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            logger.info("Successfully fetched %s", url)
            return response
        except requests.exceptions.HTTPError as e:
            logger.error("HTTP error: %s", e)
            if e.response.status_code == 404:
                # A 404 will not fix itself — give up instead of retrying.
                logger.warning("Page not found: %s", url)
                return None
        except requests.exceptions.Timeout:
            logger.error("Timeout: %s", url)
        except requests.exceptions.RequestException as e:
            logger.error("Request error: %s", e)
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
    logger.error("Failed to fetch %s after %d attempts", url, max_retries)
    return None
九、监控和维护
- 监控爬虫运行状态
- 记录成功和失败的请求
- 定期检查目标网站结构变化
- 更新反爬虫策略
- 优化性能和资源使用
十、法律和道德
重要提醒:
- 遵守网站的 robots.txt 协议
- 尊重版权和知识产权
- 不要爬取个人隐私信息
- 控制请求频率,避免对服务器造成压力
- 遵守相关法律法规
总结
Web 爬虫开发需要综合考虑技术、性能、法律等多个方面。通过本文的学习,你应该能够:
- 遵循爬虫开发的基本原则
- 优化请求性能
- 应对常见的反爬虫策略
- 高效解析和存储数据
- 实现并发处理
- 使用 EasySpider 工具提高开发效率