Introduction: This article explains in detail how to collect product data from an e-commerce platform, including product titles, prices, sales volume, and reviews. We will use Python together with Selenium to cope with the platform's anti-scraping measures and build a stable, efficient collector.
1. Requirements Analysis
Collection targets:
- Basic product information (title, price, brand)
- Product details (specifications, stock, sales volume)
- User reviews (rating, review count, review text)
- Product images (main image, detail images)
- Shop information (shop name, reputation score)
Technical challenges:
- Dynamically loaded content (Ajax requests; see the scrolling sketch after this list)
- Login state (Cookie/Session handling)
- Anti-scraping measures (rate limiting, CAPTCHAs)
- Large data volumes (pagination required)
- Page structure changes (periodic maintenance needed)
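Dynamic loading is worth a concrete illustration up front. The sketch below scrolls until lazy-loaded items stop appearing; it assumes a generic driver object and a hypothetical div.product-item selector, so adapt both to the target site:

import time
from selenium.webdriver.common.by import By

def scroll_until_stable(driver, item_selector="div.product-item", pause=1.5, max_rounds=10):
    """Scroll to the page bottom until Ajax stops appending new items."""
    last_count = 0
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give the Ajax request time to complete
        count = len(driver.find_elements(By.CSS_SELECTOR, item_selector))
        if count == last_count:
            break  # nothing new was loaded; the page is stable
        last_count = count
    return last_count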
2. Preparation
2.1 Environment Setup
# Install dependencies
pip install selenium
pip install webdriver-manager
pip install beautifulsoup4
pip install pandas
pip install requests
pip install openpyxl  # needed by pandas' to_excel in section 5.1
2.2 Configuring the WebDriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless')  # headless mode
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--window-size=1920,1080')
# Set the User-Agent
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
# Initialize the browser
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
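One optional hardening step, hedged because its effectiveness varies by site: many platforms detect Selenium through the navigator.webdriver flag, and a common mitigation is to inject a masking script via the Chrome DevTools Protocol before any page script runs:

# Sketch: mask the navigator.webdriver flag before page scripts execute.
# Not foolproof; sophisticated sites use additional fingerprinting signals.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"}
)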
3. Core Implementation
3.1 Collecting the Product List Page
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_product_list(keyword, pages=5):
    """Collect the product list for a search keyword."""
    products = []
    for page in range(1, pages + 1):
        url = f"https://search.example.com/search?q={keyword}&page={page}"
        driver.get(url)
        time.sleep(2)  # give the page time to load
        # Wait until product items are present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "product-item"))
        )
        # Extract product information
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all('div', class_='product-item')
        for item in items:
            product = {
                'title': item.find('h3', class_='title').text.strip(),
                'price': item.find('span', class_='price').text.strip(),
                'url': item.find('a')['href'],
                'sales': item.find('span', class_='sales').text.strip(),
                'rating': item.find('span', class_='rating')['data-score']
            }
            products.append(product)
            print(f"Collected: {product['title']}")
        print(f"Page {page} done, {len(items)} products")
    return products
3.2 Collecting the Product Detail Page
def get_product_detail(product_url):
    """Collect details for a single product."""
    driver.get(product_url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Extract detailed information
    detail = {
        'title': soup.find('h1', class_='product-title').text.strip(),
        'price': soup.find('span', class_='current-price').text.strip(),
        'original_price': soup.find('span', class_='original-price').text.strip() if soup.find('span', class_='original-price') else '',
        'brand': soup.find('a', class_='brand-name').text.strip() if soup.find('a', class_='brand-name') else '',
        'description': soup.find('div', class_='description').text.strip(),
        'stock': soup.find('span', class_='stock-info').text.strip(),
        'total_sales': soup.find('span', class_='total-sales').text.strip(),
        'comments_count': soup.find('span', class_='comments-count').text.strip(),
        'images': [img['src'] for img in soup.find_all('img', class_='product-image')],
        'specs': {}
    }
    # Extract specification groups
    spec_groups = soup.find_all('div', class_='spec-group')
    for group in spec_groups:
        group_name = group.find('span', class_='spec-name').text.strip()
        spec_values = [val.text.strip() for val in group.find_all('span', class_='spec-value')]
        detail['specs'][group_name] = spec_values
    return detail
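The parser above calls .text on every find() result, which raises AttributeError whenever a selector misses, and the challenges list already noted that page structure changes over time. A small hedged helper (the selectors remain hypothetical) makes extraction fault-tolerant:

def safe_text(soup, tag, class_name, default=''):
    """Return the stripped text of the first match, or a default if absent."""
    el = soup.find(tag, class_=class_name)
    return el.text.strip() if el else default

# Example rewrite of one fragile field:
# 'stock': safe_text(soup, 'span', 'stock-info'),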
3.3 Collecting User Reviews
import json

def get_product_comments(product_id, max_pages=5):
    """Collect reviews for a product."""
    comments = []
    for page in range(1, max_pages + 1):
        comment_url = f"https://comments.example.com/api/comments?productId={product_id}&page={page}"
        driver.get(comment_url)
        time.sleep(2)
        # The endpoint returns JSON, which the browser renders inside a <pre> tag
        response = driver.find_element(By.TAG_NAME, 'pre').text
        data = json.loads(response)
        if 'comments' in data:
            for item in data['comments']:
                comment = {
                    'user': item['user']['name'],
                    'rating': item['rating'],
                    'content': item['content'],
                    'date': item['createTime'],
                    'images': item.get('images', [])
                }
                comments.append(comment)
        print(f"Page {page} of reviews done, {len(comments)} comments so far")
    return comments
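Since this endpoint already returns raw JSON, driving a full browser for it is heavyweight. A lighter sketch fetches the same (assumed) endpoint with requests, reusing the cookies exported from Selenium so the session stays authenticated:

import requests

def get_comments_via_api(product_id, page, selenium_cookies):
    """Fetch one page of review JSON over plain HTTP, reusing login cookies."""
    session = requests.Session()
    for c in selenium_cookies:  # as returned by driver.get_cookies()
        session.cookies.set(c['name'], c['value'])
    url = f"https://comments.example.com/api/comments?productId={product_id}&page={page}"
    resp = session.get(url, timeout=10)
    resp.raise_for_status()
    return resp.json().get('comments', [])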
4. Coping with Anti-Scraping Measures
4.1 Keeping a Login Session
def login(username, password):
    """Simulate a login and persist the session cookies."""
    driver.get("https://login.example.com")
    # Enter username and password
    driver.find_element(By.ID, "username").send_keys(username)
    driver.find_element(By.ID, "password").send_keys(password)
    # Click the login button
    driver.find_element(By.ID, "login-btn").click()
    # Wait for the login to complete
    WebDriverWait(driver, 10).until(
        EC.url_contains("home")
    )
    # Save cookies for later runs
    cookies = driver.get_cookies()
    with open('cookies.json', 'w') as f:
        json.dump(cookies, f)
    print("Login successful")

def load_cookies():
    """Load previously saved cookies."""
    driver.get("https://example.com")  # must be on the target domain before adding cookies
    with open('cookies.json', 'r') as f:
        cookies = json.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)
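Cookies added this way are only sent on subsequent requests, so reload the page after restoring them; a quick usage sketch:

# Reuse a saved session instead of logging in on every run.
load_cookies()
driver.refresh()  # reload so the restored cookies actually take effect
# Optionally verify by checking for an element only visible when logged in.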
4.2 Random Delays
import random

def random_delay(min_sec=1, max_sec=3):
    """Sleep for a random interval to mimic human browsing."""
    delay = random.uniform(min_sec, max_sec)
    time.sleep(delay)

# Call this before every request
def safe_request(url):
    random_delay()  # random delay
    driver.get(url)
4.3 User-Agent Rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
]

def get_random_user_agent():
    return random.choice(USER_AGENTS)

Note that Chrome options are fixed once the browser has launched, so calling chrome_options.add_argument() on an existing driver has no effect; a new User-Agent only applies when a driver instance is created.
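Because of that, a minimal rotation sketch (reusing the imports from section 2.2) recreates the driver with a fresh User-Agent every so often:

def new_driver_with_random_ua():
    """Create a fresh headless driver with a randomly chosen User-Agent."""
    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--window-size=1920,1080')
    opts.add_argument(f'user-agent={get_random_user_agent()}')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

# e.g. rotate every N pages:
# driver.quit()
# driver = new_driver_with_random_ua()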
5. Data Storage
5.1 Saving to Excel
import pandas as pd

def save_to_excel(data, filename='products.xlsx'):
    """Save collected data to an Excel file."""
    df = pd.DataFrame(data)
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f"Data saved to {filename}")

# Usage example
products = get_product_list("phone", pages=3)
save_to_excel(products)
5.2 Saving to a Database
import sqlite3

def save_to_database(data, db_file='products.db'):
    """Save collected data to a SQLite database."""
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    # Create the table if it does not exist
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            price TEXT,
            url TEXT UNIQUE,
            sales TEXT,
            rating TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    # Insert rows, skipping duplicates on the unique URL column
    for item in data:
        try:
            cursor.execute('''
                INSERT INTO products (title, price, url, sales, rating)
                VALUES (?, ?, ?, ?, ?)
            ''', (item['title'], item['price'], item['url'], item['sales'], item['rating']))
        except sqlite3.IntegrityError:
            print(f"URL already exists: {item['url']}")
    conn.commit()
    conn.close()
    print("Data saved to the database")
6. Putting It All Together
def main():
    """Main entry point."""
    keyword = "wireless earbuds"
    pages = 5
    print(f"Starting collection for: {keyword}")
    try:
        # 1. Collect the product list
        products = get_product_list(keyword, pages)
        print(f"Collected {len(products)} products")
        # 2. Collect product details
        detailed_products = []
        for i, product in enumerate(products):
            print(f"Collecting detail {i+1}/{len(products)}")
            try:
                detail = get_product_detail(product['url'])
                detail.update(product)
                detailed_products.append(detail)
                random_delay(2, 4)  # random delay between detail pages
            except Exception as e:
                print(f"Collection failed: {e}")
        # 3. Save the data
        save_to_excel(detailed_products, f"{keyword}_products.xlsx")
        save_to_database(detailed_products)
    finally:
        # 4. Always close the browser, even if an error occurred
        driver.quit()
    print("Collection complete!")

if __name__ == '__main__':
    main()
7. Error Handling and Optimization
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='spider.log'
)
logger = logging.getLogger(__name__)

def safe_request_with_retry(url, max_retries=3):
    """Request a page, retrying on failure."""
    for attempt in range(max_retries):
        try:
            driver.get(url)
            random_delay(1, 2)
            return True
        except Exception as e:
            logger.error(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(5)  # wait before retrying
    logger.error(f"Request failed after {max_retries} retries: {url}")
    return False
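A common refinement, sketched below, replaces the fixed 5-second wait with exponential backoff plus jitter so that repeated failures back off progressively:

def safe_request_with_backoff(url, max_retries=3, base_delay=2):
    """Retry with exponentially growing, jittered delays: ~2s, ~4s, ~8s."""
    for attempt in range(max_retries):
        try:
            driver.get(url)
            random_delay(1, 2)
            return True
        except Exception as e:
            logger.error(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 1))
    logger.error(f"Giving up after {max_retries} retries: {url}")
    return False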
8. Summary
This article walked through a complete e-commerce product scraper, covering:
- Requirements analysis and environment setup
- Collecting product lists, details, and reviews
- Anti-scraping countermeasures (login, delays, UA rotation)
- Data storage options (Excel, database)
- Error handling and logging
Using EasySpider: in practice, you can use EasySpider's curl-to-Python converter to generate request code quickly and its JSON formatting tool to analyze API responses, which greatly speeds up development.
Important reminder: when scraping e-commerce data, respect the site's terms of service, throttle your request rate, and avoid putting undue load on the site. This article is for learning purposes only.