电商商品数据爬取实战

简介：本文将详细介绍如何从电商平台采集商品数据，包括商品标题、价格、销量、评价等信息。我们将使用 Python + Selenium 的组合（并配合 BeautifulSoup 解析页面），应对电商平台的反爬虫机制，实现稳定高效的数据采集。

一、需求分析

采集目标：

商品基本信息（标题、价格、品牌）
商品详情（规格、库存、销量）
用户评价（评分、评论数、评论内容）
商品图片（主图、详情图）
店铺信息（店铺名称、信誉度）

技术挑战：

动态加载内容（Ajax 请求）
登录验证（Cookie/Session）
反爬虫机制（频率限制、验证码）
数据量大（需要分页处理）
页面结构变化（需要定期维护）

二、准备工作

2.1 环境搭建

                        # Install dependencies
pip install selenium
pip install webdriver-manager
pip install beautifulsoup4
pip install pandas
pip install openpyxl  # Excel writer engine required by save_to_excel (engine='openpyxl')
pip install requests
                    

2.2 配置 WebDriver

                        # Standard library
import json
import time
from urllib.parse import urlencode

# Third-party
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Build the Chrome launch options from a single ordered list of flags
chrome_options = Options()
for _flag in (
    '--headless',  # run without a visible browser window
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--window-size=1920,1080',
    # User-Agent string presented by the browser
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
):
    chrome_options.add_argument(_flag)
del _flag  # keep the module namespace identical to the step-by-step version

# Start the browser; webdriver-manager supplies the chromedriver path
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)
                    

三、核心代码实现

3.1 商品列表页采集

                        # Listing-page scraper: one search-results page per loop iteration.
def _parse_product_item(item):
    """Convert one ``div.product-item`` element into a flat product dict.

    Sub-elements or attributes that are absent yield ``''`` instead of
    raising, so a single malformed card cannot abort the whole crawl.
    """
    def text_of(tag, css_class):
        node = item.find(tag, class_=css_class)
        return node.text.strip() if node is not None else ''

    link = item.find('a')
    rating = item.find('span', class_='rating')
    return {
        'title': text_of('h3', 'title'),
        'price': text_of('span', 'price'),
        'url': link.get('href', '') if link is not None else '',
        'sales': text_of('span', 'sales'),
        'rating': rating.get('data-score', '') if rating is not None else '',
    }


def get_product_list(keyword, pages=5):
    """Collect product summaries from the search result pages.

    Args:
        keyword: Search term; it is URL-encoded before being sent.
        pages: Number of result pages to visit, starting at page 1.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``url``,
        ``sales`` and ``rating``. Fields missing from a card are ``''``.
        If a page shows no product elements within the wait timeout,
        paging stops and the products gathered so far are returned.
    """
    products = []

    for page in range(1, pages + 1):
        # urlencode escapes spaces, '&', '#' and non-ASCII keywords
        query = urlencode({'q': keyword, 'page': page})
        driver.get(f"https://search.example.com/search?{query}")
        time.sleep(2)  # fixed pause before polling for the product elements

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "product-item"))
            )
        except TimeoutException:
            # No product cards appeared: keep what was collected instead of
            # losing every earlier page to an unhandled exception.
            print(f"第 {page} 页未找到商品，停止翻页")
            break

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all('div', class_='product-item')

        for item in items:
            product = _parse_product_item(item)
            products.append(product)
            print(f"已采集: {product['title']}")

        print(f"第 {page} 页采集完成，共 {len(items)} 个商品")

    return products
                    

3.2 商品详情页采集

                        # Detail-page scraper for a single product URL.
def get_product_detail(product_url):
    """Scrape one product detail page.

    Args:
        product_url: URL of the product detail page to open.

    Returns:
        A dict with ``title``, ``price``, ``original_price``, ``brand``,
        ``description``, ``stock``, ``total_sales``, ``comments_count``
        (strings, ``''`` when the element is absent), ``images`` (list of
        ``src`` values) and ``specs`` (spec name -> list of values).

    Raises:
        ValueError: If the page has no product title element, i.e. the
            response does not look like a product detail page.
    """
    driver.get(product_url)
    time.sleep(3)  # fixed pause to let the page finish rendering

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    def text_of(tag, css_class, scope=soup):
        # One lookup per field; absent elements become '' for every field,
        # matching how original_price and brand were already treated.
        node = scope.find(tag, class_=css_class)
        return node.text.strip() if node is not None else ''

    title = text_of('h1', 'product-title')
    if not title:
        raise ValueError(f"未找到商品标题，页面可能不是商品详情页: {product_url}")

    detail = {
        'title': title,
        'price': text_of('span', 'current-price'),
        'original_price': text_of('span', 'original-price'),
        'brand': text_of('a', 'brand-name'),
        'description': text_of('div', 'description'),
        'stock': text_of('span', 'stock-info'),
        'total_sales': text_of('span', 'total-sales'),
        'comments_count': text_of('span', 'comments-count'),
        # Skip <img> tags without a src attribute instead of raising KeyError
        'images': [
            img['src']
            for img in soup.find_all('img', class_='product-image')
            if img.get('src')
        ],
        'specs': {},
    }

    # Each spec group contributes one name -> [values] entry
    for group in soup.find_all('div', class_='spec-group'):
        group_name = text_of('span', 'spec-name', scope=group)
        if not group_name:
            continue  # a group without a name cannot be keyed
        detail['specs'][group_name] = [
            val.text.strip()
            for val in group.find_all('span', class_='spec-value')
        ]

    return detail
                    

3.3 用户评价采集

                        # Review scraper backed by the JSON comments endpoint.
def get_product_comments(product_id, max_pages=5):
    """Collect user reviews for one product from the comments endpoint.

    Args:
        product_id: Product identifier; it is URL-encoded into the query.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        A list of dicts with the keys ``user``, ``rating``, ``content``,
        ``date`` and ``images``. Paging stops early when a page cannot be
        parsed or contains no comments; reviews gathered so far are kept.
    """
    comments = []

    for page in range(1, max_pages + 1):
        query = urlencode({'productId': product_id, 'page': page})
        driver.get(f"https://comments.example.com/api/comments?{query}")
        time.sleep(2)

        # The JSON body is read from a <pre> element of the rendered page,
        # as in the original flow; an error page has neither <pre> nor JSON.
        try:
            raw = driver.find_element(By.TAG_NAME, 'pre').text
            data = json.loads(raw)
        except (NoSuchElementException, json.JSONDecodeError) as exc:
            print(f"第 {page} 页评价解析失败，停止采集: {exc}")
            break

        page_items = data.get('comments') if isinstance(data, dict) else None
        if not page_items:
            # An empty page means there is nothing further to fetch
            break

        for item in page_items:
            comments.append({
                # 'user' may be absent or null; fall back to ''
                'user': (item.get('user') or {}).get('name', ''),
                'rating': item.get('rating'),
                'content': item.get('content', ''),
                'date': item.get('createTime', ''),
                'images': item.get('images', []),
            })

        print(f"第 {page} 页评价采集完成，共 {len(comments)} 条")

    return comments
                    

四、反爬虫应对策略

4.1 登录保持

                        # Session helper: log in once and persist the cookies.
def login(username, password, cookie_file='cookies.json'):
    """Log in through the web form and save the session cookies as JSON.

    Args:
        username: Account name typed into the ``username`` field.
        password: Password typed into the ``password`` field.
        cookie_file: Path the cookies are written to. Defaults to
            ``cookies.json``, the path load_cookies() reads by default.

    Raises:
        selenium.common.exceptions.TimeoutException: If the login form does
            not appear, or the URL never contains ``home``, within 10 s.
    """
    driver.get("https://login.example.com")
    wait = WebDriverWait(driver, 10)

    # Wait for the form rather than assuming it exists right after navigation
    username_box = wait.until(
        EC.presence_of_element_located((By.ID, "username"))
    )
    username_box.send_keys(username)
    driver.find_element(By.ID, "password").send_keys(password)

    driver.find_element(By.ID, "login-btn").click()

    # Login is considered complete once the URL contains "home"
    wait.until(EC.url_contains("home"))

    # Fetch cookies before opening the file so a browser error cannot
    # truncate a previously saved cookie file.
    cookies = driver.get_cookies()
    with open(cookie_file, 'w', encoding='utf-8') as f:
        json.dump(cookies, f)

    print("登录成功")

def load_cookies(cookie_file='cookies.json'):
    """Load cookies saved as JSON and add them to the shared browser session.

    Args:
        cookie_file: Path of the JSON cookie file. Defaults to
            ``cookies.json``, the path login() writes by default.

    Raises:
        FileNotFoundError: If ``cookie_file`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read the file first so a missing or corrupt file fails before any
    # navigation, and the file handle is closed before browser calls start.
    with open(cookie_file, 'r', encoding='utf-8') as f:
        cookies = json.load(f)

    # Open the site before adding cookies, as the original flow does.
    # NOTE(review): Selenium documents that add_cookie needs a page on the
    # cookie's domain to be loaded — confirm this URL matches the login domain.
    driver.get("https://example.com")
    for cookie in cookies:
        driver.add_cookie(cookie)
                    

4.2 随机延迟

                        import random
import time

def random_delay(min_sec=1, max_sec=3):
    """Sleep for a duration drawn uniformly between min_sec and max_sec seconds."""
    time.sleep(random.uniform(min_sec, max_sec))

# Throttled navigation: pause for a random interval, then load the page
def safe_request(url):
    """Call random_delay() with its default bounds, then open *url* in the shared driver."""
    random_delay()
    driver.get(url)
                    

4.3 User-Agent 轮换

                        # Pool of User-Agent strings that get_random_user_agent() draws from.
USER_AGENTS = [
    # Chrome 91 on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # Chrome 91 on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # Chrome 91 on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # Firefox 89 on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
]


def get_random_user_agent():
    """Return one entry of USER_AGENTS chosen at random."""
    chosen = random.choice(USER_AGENTS)
    return chosen

# Append a randomly chosen User-Agent flag to the Chrome options.
# NOTE(review): chrome_options was already passed to webdriver.Chrome(...)
# earlier in this file; presumably this only affects drivers created
# afterwards — confirm the intended ordering.
chrome_options.add_argument('user-agent=' + get_random_user_agent())
                    

五、数据存储

5.1 存储到 Excel

                        import pandas as pd

def save_to_excel(data, filename='products.xlsx'):
    """Write *data* to an Excel workbook through pandas.

    Args:
        data: Records passed straight to ``pandas.DataFrame``.
        filename: Destination path of the workbook.
    """
    pd.DataFrame(data).to_excel(filename, engine='openpyxl', index=False)
    print(f"数据已保存到 {filename}")

# Example: scrape three result pages for the keyword, then export them
products = get_product_list(keyword="手机", pages=3)
save_to_excel(data=products)
                    

5.2 存储到数据库

                        import sqlite3

def save_to_database(data, db_file='products.db'):
    """Insert product records into the ``products`` table of a SQLite file.

    The table is created on first use. ``url`` is UNIQUE, so a record whose
    URL is already stored is reported and skipped rather than inserted.

    Args:
        data: Iterable of dicts with the keys ``title``, ``price``, ``url``,
            ``sales`` and ``rating``.
        db_file: Path of the SQLite database file.

    Raises:
        KeyError: If a record lacks one of the required keys. The pending
            inserts are rolled back and the connection is still closed.
    """
    conn = sqlite3.connect(db_file)
    try:
        # "with conn" commits on success and rolls back if anything raises
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT,
                    price TEXT,
                    url TEXT UNIQUE,
                    sales TEXT,
                    rating TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            for item in data:
                row = (
                    item['title'],
                    item['price'],
                    item['url'],
                    item['sales'],
                    item['rating'],
                )
                try:
                    conn.execute('''
                        INSERT INTO products (title, price, url, sales, rating)
                        VALUES (?, ?, ?, ?, ?)
                    ''', row)
                except sqlite3.IntegrityError:
                    # Duplicate URL: keep the stored row and move on
                    print(f"URL 已存在: {item['url']}")
    finally:
        # Always release the connection, even when a record is malformed
        conn.close()

    print("数据已保存到数据库")
                    

六、完整流程整合

                        # Entry point: list pages -> detail pages -> storage, then browser shutdown.
def main():
    """Run the full crawl for one hard-coded keyword.

    Steps: collect the product list, fetch each product's detail page,
    merge both records, save the result to Excel and SQLite. The browser
    is closed even when one of the steps raises.
    """
    keyword = "无线耳机"
    pages = 5

    print(f"开始采集商品: {keyword}")

    try:
        # 1. Collect the product list
        products = get_product_list(keyword, pages)
        print(f"共采集 {len(products)} 个商品")

        # 2. Collect the detail page of every product
        detailed_products = []
        total = len(products)
        for index, product in enumerate(products, start=1):
            print(f"正在采集第 {index}/{total} 个商品详情")
            try:
                detail = get_product_detail(product['url'])
            except Exception as e:
                # One failing product must not stop the whole run
                print(f"采集失败: {e}")
            else:
                # Listing fields overwrite same-named detail fields
                detail.update(product)
                detailed_products.append(detail)
            finally:
                # Throttle after failures too, so errors do not speed up requests
                random_delay(2, 4)

        # 3. Persist the merged records
        save_to_excel(detailed_products, f"{keyword}_products.xlsx")
        save_to_database(detailed_products)
    finally:
        # 4. Always close the browser so no Chrome process is left behind
        driver.quit()

    print("采集完成！")

if __name__ == '__main__':
    main()
                    

七、异常处理和优化

                        import logging

# Configure logging to spider.log. The handler is created explicitly so the
# file is written as UTF-8: the log messages in this file contain Chinese
# text, which a platform-default encoding may not be able to represent.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('spider.log', encoding='utf-8')]
)

logger = logging.getLogger(__name__)

def safe_request_with_retry(url, max_retries=3, retry_wait=5):
    """Open *url* in the shared driver, retrying when navigation fails.

    Args:
        url: Page to load.
        max_retries: Maximum number of attempts.
        retry_wait: Seconds to wait between failed attempts (default 5,
            the value that was previously hard-coded).

    Returns:
        True as soon as one attempt succeeds, False when every attempt fails.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Only the navigation itself is retried
            driver.get(url)
        except Exception as e:
            logger.error("请求失败 (尝试 %d/%d): %s", attempt, max_retries, e)
            if attempt < max_retries:
                time.sleep(retry_wait)  # back off before the next attempt
        else:
            random_delay(1, 2)  # throttle after a successful load
            return True

    logger.error("请求失败，已达最大重试次数: %s", url)
    return False
                    

八、总结

本文完整介绍了电商商品数据爬取的实现过程，包括：

需求分析和环境搭建
商品列表、详情、评价的采集方法
反爬虫应对策略（登录、延迟、UA轮换）
数据存储方案（Excel、数据库）
异常处理和日志记录

使用 EasySpider 工具：在实际开发中，你可以使用 EasySpider 的 Curl 转 Python 功能快速生成请求代码，使用 JSON 格式化工具分析 API 响应，大大提高开发效率。

重要提醒：爬取电商数据时请注意遵守网站的服务条款，控制采集频率，不要对网站造成过大压力。本文仅供学习参考。

返回博客列表