招聘网站信息采集实战

简介：本文将分享如何从招聘网站采集职位数据，包括职位名称、薪资范围、公司信息、任职要求等。通过实际案例，你将学会处理分页、解析复杂HTML、提取结构化数据等技巧。

一、项目需求

采集目标：

职位基本信息（职位名称、薪资、地点、经验要求）
公司信息（公司名称、规模、行业、福利）
职位详情（岗位职责、任职要求）
发布时间、浏览量等信息

数据分析目标：

不同职位的薪资水平分析
热门技术栈统计
地区招聘热度分析
公司规模与薪资关系

二、技术方案

2.1 使用 Requests + BeautifulSoup

                        import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Default headers for requests made through `session`; the User-Agent string
# identifies the client as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive'
}

# Create one shared Session and install the headers above as its defaults.
session = requests.Session()
session.headers.update(headers)
                    

2.2 职位列表采集

                        def get_job_list(keyword, city='北京', pages=10):
    """获取职位列表"""
    job_list = []

    for page in range(1, pages + 1):
        url = f"https://jobs.example.com/search?keyword={keyword}&city={city}&page={page}"

        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # 查找职位卡片
            job_cards = soup.find_all('div', class_='job-card')

            for card in job_cards:
                try:
                    job = {
                        'job_title': card.find('a', class_='job-title').text.strip(),
                        'company_name': card.find('a', class_='company-name').text.strip(),
                        'salary': card.find('span', class_='salary').text.strip(),
                        'location': card.find('span', class_='location').text.strip(),
                        'experience': card.find('span', class_='experience').text.strip(),
                        'education': card.find('span', class_='education').text.strip(),
                        'job_url': card.find('a', class_='job-title')['href'],
                        'company_url': card.find('a', class_='company-name')['href'],
                        'tags': [tag.text for tag in card.find_all('span', class_='tag')],
                        'publish_time': card.find('span', class_='publish-time').text.strip()
                    }
                    job_list.append(job)
                    print(f"采集: {job['job_title']} - {job['company_name']}")
                except AttributeError as e:
                    print(f"解析失败: {e}")
                    continue

            print(f"第 {page} 页采集完成，共 {len(job_cards)} 个职位")
            time.sleep(random.uniform(1, 2))  # 随机延迟

        except Exception as e:
            print(f"请求失败: {e}")
            continue

    return job_list
                    

2.3 职位详情采集

                        def get_job_detail(job_url):
    """获取职位详情"""
    full_url = f"https://jobs.example.com{job_url}"

    try:
        response = session.get(full_url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # 提取详情
        detail = {
            'job_description': soup.find('div', class_='job-description').get_text(separator='\n').strip(),
            'requirements': soup.find('div', class_='job-requirements').get_text(separator='\n').strip(),
            'benefits': [ben.text.strip() for ben in soup.find_all('span', class_='benefit-item')],
            'company_size': soup.find('span', class_='company-size').text.strip() if soup.find('span', class_='company-size') else '',
            'company_industry': soup.find('span', class_='company-industry').text.strip() if soup.find('span', class_='company-industry') else '',
            'view_count': soup.find('span', class_='view-count').text.strip(),
            'apply_count': soup.find('span', class_='apply-count').text.strip()
        }

        return detail

    except Exception as e:
        print(f"获取详情失败: {e}")
        return {}
                    

三、数据清洗和处理

3.1 薪资数据解析

                        import re

def parse_salary(salary_str):
    """Convert a salary description into a monthly (min, max) range in yuan.

    Recognised forms include ``"15-25K"``, ``"15-25K·14薪"``,
    ``"15000-25000元/月"``, ``"1.5-2万/月"``, ``"300-500/天"``,
    ``"30-50万/年"`` and a single value with a unit such as ``"20K"``.

    Rules:
        * Units: ``k``/``K``/``千`` mean x1000, ``万`` means x10000. A unit
          written on only one end of a range applies to both ends.
        * Period: an explicit ``/天``, ``/日``, ``/月``, ``/年``, ``每…`` or
          ``日薪``/``月薪``/``年薪`` marker selects the period; otherwise the
          figure is treated as monthly. Daily pay is multiplied by 21
          working days; yearly pay is integer-divided by 12.
        * Unit-less monthly/yearly figures below 1000 are read as thousands
          (``"15-25/月"`` -> 15000-25000); larger unit-less figures are
          taken as yuan.

    Args:
        salary_str: Raw salary text scraped from a job card.

    Returns:
        A ``(min_salary, max_salary)`` tuple of ints. ``(0, 0)`` is returned
        when the input is not a string or no salary figure can be found
        (for example ``"面议"``).
    """
    if not isinstance(salary_str, str):
        return 0, 0

    number = r'(\d+(?:\.\d+)?)'
    optional_unit = r'\s*([kK千万]?)'

    range_match = re.search(
        number + optional_unit + r'\s*[-~～–—至]\s*' + number + optional_unit,
        salary_str,
    )
    if range_match:
        low, low_unit, high, high_unit = range_match.groups()
        # "15-25K" states the unit once; it applies to both ends.
        low_unit = low_unit or high_unit
        high_unit = high_unit or low_unit
    else:
        # A lone number only counts as a salary when it carries a unit, so
        # fragments such as the "14" in "14薪" are not mistaken for pay.
        single_match = re.search(number + r'\s*([kK千万])', salary_str)
        if not single_match:
            return 0, 0
        low, low_unit = single_match.groups()
        high, high_unit = low, low_unit

    period_match = re.search(r'[/每]\s*(天|日|月|年)|(日|月|年)薪', salary_str)
    period = (period_match.group(1) or period_match.group(2)) if period_match else '月'
    is_daily = period in ('天', '日')

    multipliers = {'': 1, 'k': 1000, '千': 1000, '万': 10000}
    low_value = float(low) * multipliers[low_unit.lower()]
    high_value = float(high) * multipliers[high_unit.lower()]

    # Unit-less small figures in a monthly/yearly salary are shorthand for
    # thousands; daily wages are always plain yuan.
    if not is_daily and not high_unit and float(high) < 1000:
        low_value *= 1000
        high_value *= 1000

    min_salary = round(low_value)
    max_salary = round(high_value)

    if is_daily:
        # Assume 21 working days per month.
        return min_salary * 21, max_salary * 21
    if period == '年':
        return min_salary // 12, max_salary // 12
    return min_salary, max_salary

# Usage example
salary_str = "15-25K·14薪"
min_sal, max_sal = parse_salary(salary_str)
print(f"薪资范围: {min_sal} - {max_sal} 元/月")
                    

3.2 技术栈提取

                        # Known technology keywords, grouped by category.
TECH_STACKS = {
    '后端': ['Java', 'Python', 'Go', 'C++', 'Node.js', 'PHP', 'Spring', 'Django', 'Flask'],
    '前端': ['Vue', 'React', 'Angular', 'JavaScript', 'TypeScript', 'HTML', 'CSS'],
    '数据库': ['MySQL', 'Redis', 'MongoDB', 'PostgreSQL', 'Oracle'],
    '中间件': ['Kafka', 'RabbitMQ', 'Nginx', 'Docker', 'Kubernetes'],
    '大数据': ['Hadoop', 'Spark', 'Flink', 'Hive', 'HBase']
}

def extract_tech_stacks(text):
    """Find the technologies from ``TECH_STACKS`` mentioned in a job description.

    Matching is case-insensitive and respects word boundaries on the Latin
    letters of each keyword: a keyword is not matched when it is directly
    preceded or followed by another ASCII letter. This keeps "Go" from
    matching inside "Django" or "MongoDB" and "Java" from matching inside
    "JavaScript", while still matching keywords that touch Chinese text,
    punctuation or digits ("精通Java、", "Python3"). As a consequence,
    spellings such as "Golang" are not counted as "Go".

    Args:
        text: Job description text. Non-string values (e.g. NaN from a
            missing field) are treated as containing no technologies.

    Returns:
        A dict with one key per ``TECH_STACKS`` category, each mapping to
        the list of matched keywords in ``TECH_STACKS`` order.
    """
    import re

    # Derive the categories from TECH_STACKS so the two cannot drift apart.
    found_techs = {category: [] for category in TECH_STACKS}
    if not isinstance(text, str):
        return found_techs

    for category, techs in TECH_STACKS.items():
        for tech in techs:
            pattern = re.escape(tech)
            # Guard only the ends that are letters; "C++" ends in "+", so
            # nothing needs to follow it.
            if tech[0].isalpha():
                pattern = r'(?<![A-Za-z])' + pattern
            if tech[-1].isalpha():
                pattern = pattern + r'(?![A-Za-z])'
            if re.search(pattern, text, re.IGNORECASE):
                found_techs[category].append(tech)

    return found_techs

# Usage example
job_desc = "精通Java、Spring Boot，熟悉Redis、MySQL，了解Docker"
techs = extract_tech_stacks(job_desc)
print(techs)
                    

四、数据分析

4.1 薪资分析

                        import matplotlib.pyplot as plt
import seaborn as sns

def analyze_salary(df):
    """Add an average-salary column and save two salary charts.

    Writes ``salary_distribution.png`` (histogram of average salaries) and
    ``salary_by_job.png`` (top 10 job titles by mean average salary) to the
    current directory.

    Rows whose average salary is 0 are left out of both charts: the salary
    parser reports ``(0, 0)`` when it cannot read a salary, and counting
    those rows would drag the distribution and the means towards zero. When
    no row has a usable salary, no chart is produced.

    NOTE: the chart labels are Chinese; matplotlib needs a CJK-capable font
    configured for them to render.

    Args:
        df: DataFrame with ``min_salary``, ``max_salary`` and ``job_title``
            columns. It is modified in place: an ``avg_salary`` column is
            added.

    Returns:
        The same DataFrame, including the new ``avg_salary`` column.
    """
    # Midpoint of the parsed salary range.
    df['avg_salary'] = (df['min_salary'] + df['max_salary']) / 2

    priced = df[df['avg_salary'] > 0]
    if priced.empty:
        return df

    # Salary distribution.
    fig = plt.figure(figsize=(12, 6))
    sns.histplot(priced['avg_salary'], bins=30, kde=True)
    plt.title('薪资分布')
    plt.xlabel('月薪（元）')
    plt.ylabel('职位数量')
    plt.tight_layout()
    plt.savefig('salary_distribution.png')
    plt.close(fig)  # release the figure; pyplot keeps it alive otherwise

    # Mean average salary per job title, highest first.
    salary_by_job = (
        priced.groupby('job_title')['avg_salary']
        .mean()
        .sort_values(ascending=False)
        .head(10)
    )

    fig = plt.figure(figsize=(12, 6))
    salary_by_job.plot(kind='bar')
    plt.title('各职位平均薪资（Top 10）')
    plt.xlabel('职位')
    plt.ylabel('平均薪资（元）')
    plt.xticks(rotation=45)
    plt.tight_layout()  # keep the rotated tick labels inside the image
    plt.savefig('salary_by_job.png')
    plt.close(fig)

    return df
                    

4.2 技术栈热度统计

                        from collections import Counter

def analyze_tech_stacks(df):
    """Count how often each technology appears and chart the top 20.

    Writes ``tech_stack_popularity.png`` to the current directory when at
    least one technology was found.

    Args:
        df: Object whose ``'tech_stacks'`` column yields one dict per job,
            mapping a category name to a list of technology names. Entries
            that are not dicts (e.g. NaN for jobs without a description)
            are ignored.

    Returns:
        A list of ``(technology, count)`` tuples for the 20 most common
        technologies, most common first; an empty list when none were found.
    """
    all_techs = []
    for tech_dict in df['tech_stacks']:
        if not isinstance(tech_dict, dict):
            continue
        for techs in tech_dict.values():
            all_techs.extend(techs)

    top_techs = Counter(all_techs).most_common(20)
    if not top_techs:
        # Nothing to plot, and zip(*[]) below could not be unpacked.
        return top_techs

    # Visualisation.
    fig = plt.figure(figsize=(12, 8))
    techs, counts = zip(*top_techs)
    plt.barh(range(len(techs)), counts)
    plt.yticks(range(len(techs)), techs)
    plt.gca().invert_yaxis()  # put the most common technology at the top
    plt.xlabel('出现次数')
    plt.title('技术栈热度（Top 20）')
    plt.tight_layout()
    plt.savefig('tech_stack_popularity.png')
    plt.close(fig)  # release the figure; pyplot keeps it alive otherwise

    return top_techs
                    

五、反爬虫应对

5.1 请求频率控制

                        import time
import random

class RateLimiter:
    """Throttle that pauses for a random interval between requests."""

    def __init__(self, min_delay=1, max_delay=3):
        """Store the bounds of the pause, in seconds."""
        self.min_delay, self.max_delay = min_delay, max_delay

    def wait(self):
        """Sleep for a random duration drawn between the two bounds."""
        time.sleep(random.uniform(self.min_delay, self.max_delay))

# Usage
limiter = RateLimiter(min_delay=2, max_delay=5)
# NOTE(review): `urls` is not defined in this snippet — presumably an iterable
# of URL strings supplied by the surrounding code; confirm before running.
for url in urls:
    response = session.get(url)
    limiter.wait()  # pause for a random interval after every request
                    

5.2 Cookie 管理

                        def save_cookies(cookies_file='cookies.json'):
    """保存 Cookie"""
    import json
    cookies_dict = {cookie.name: cookie.value for cookie in session.cookies}
    with open(cookies_file, 'w') as f:
        json.dump(cookies_dict, f)

def load_cookies(cookies_file='cookies.json'):
    """Load cookies from a JSON file into the shared session.

    The file is expected to hold a JSON object mapping cookie names to
    values. Loading is best-effort: a missing, unparsable or wrongly shaped
    file is reported with a message and leaves the session untouched
    instead of raising.

    Args:
        cookies_file: Path of the JSON file to read.

    Returns:
        True when the cookies were installed on the session, False otherwise.
    """
    import json

    try:
        with open(cookies_file, 'r') as f:
            cookies_dict = json.load(f)
    except FileNotFoundError:
        print("Cookie 文件不存在")
        return False
    except json.JSONDecodeError as e:
        # e.g. a file truncated by an interrupted save
        print(f"Cookie 文件格式错误: {e}")
        return False

    if not isinstance(cookies_dict, dict):
        print("Cookie 文件格式错误: 顶层应为 JSON 对象")
        return False

    for name, value in cookies_dict.items():
        session.cookies.set(name, value)
    print("Cookie 加载成功")
    return True
                    

六、完整流程

                        def main():
    """主流程"""
    # 参数配置
    keyword = "Python工程师"
    city = "北京"
    pages = 20

    print(f"开始采集 {city} 的 {keyword} 职位...")

    # 1. 采集职位列表
    print("步骤1: 采集职位列表")
    job_list = get_job_list(keyword, city, pages)
    print(f"共采集 {len(job_list)} 个职位")

    # 2. 采集职位详情
    print("步骤2: 采集职位详情")
    for i, job in enumerate(job_list, 1):
        print(f"采集详情: {i}/{len(job_list)}")
        detail = get_job_detail(job['job_url'])
        job.update(detail)
        time.sleep(random.uniform(1, 2))

    # 3. 数据清洗
    print("步骤3: 数据清洗")
    df = pd.DataFrame(job_list)

    # 解析薪资
    df[['min_salary', 'max_salary']] = df['salary'].apply(
        lambda x: pd.Series(parse_salary(x))
    )

    # 提取技术栈
    df['tech_stacks'] = df['job_description'].apply(extract_tech_stacks)

    # 4. 保存数据
    print("步骤4: 保存数据")
    df.to_excel(f"{keyword}_{city}_jobs.xlsx", index=False, engine='openpyxl')
    df.to_csv(f"{keyword}_{city}_jobs.csv", index=False, encoding='utf-8-sig')
    print(f"数据已保存")

    # 5. 数据分析
    print("步骤5: 数据分析")
    analyze_salary(df)
    analyze_tech_stacks(df)
    print("分析完成")

    print("全部完成！")

if __name__ == '__main__':
    main()
                    

七、数据示例

                        # Example of one collected record
{
    "job_title": "高级Python工程师",
    "company_name": "某某科技有限公司",
    "salary": "25-40K",
    "location": "北京·朝阳区",
    "experience": "5-10年",
    "education": "本科",
    "min_salary": 25000,
    "max_salary": 40000,
    "job_description": "岗位职责：\n1. 负责后端系统开发和优化\n2. 参与技术方案设计和评审\n3. 解决复杂技术问题",
    "tech_stacks": {
        "后端": ["Python", "Django", "Flask"],
        "数据库": ["MySQL", "Redis"],
        "中间件": ["Docker", "Kafka"]
    },
    "company_size": "100-499人",
    "company_industry": "互联网"
}
                    

八、总结

通过本文的学习，你已经掌握了：

招聘网站职位信息的采集方法
复杂数据的解析和清洗技巧
薪资数据的标准化处理
技术栈的智能提取
数据分析和可视化方法

使用 EasySpider 工具：在开发过程中，可以使用 EasySpider 的 Curl 转 Python 功能快速生成请求代码，使用 JSON 格式化工具分析 API 响应数据，大大提高开发效率。

法律提示：请遵守招聘网站的服务条款，仅用于学习和研究目的，不得用于商业用途或侵犯他人权益。

返回博客列表