房地产信息采集实战

房价数据、房源信息采集与分析

简介:本文将分享房地产网站的数据采集经验,包括二手房、新房、租房等房源信息的采集。通过实际案例,你将学会如何处理分页、筛选条件、地图坐标等复杂功能,并进行房价趋势分析。

一、需求分析

采集目标:

  • 房源基本信息(标题、价格、面积、户型)
  • 位置信息(区域、商圈、地址)
  • 房源详情(楼层、装修、朝向、年代)
  • 配套设施(地铁、学校、医院等)
  • 房源图片

分析目标:

  • 各区域房价对比
  • 户型与价格关系
  • 价格趋势分析
  • 地铁房溢价分析

二、技术实现

2.1 基础配置

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
from datetime import datetime

# --- Scraper configuration ---
BASE_URL = "https://house.example.com"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Referer': BASE_URL,
}

# One shared Session: reuses the TCP connection and keeps cookies and
# the default headers across all requests made by the crawler functions.
session = requests.Session()
session.headers.update(HEADERS)

2.2 二手房列表采集

import re

# Shared pattern: a single integer or decimal number (commas stripped first).
_NUM_RE = re.compile(r'\d+(?:\.\d+)?')


def get_second_hand_list(city='北京', district='', pages=5):
    """Scrape second-hand (resale) listing pages.

    Args:
        city: City segment of the listing URL.
        district: District segment ('' = all districts).
        pages: Number of paginated listing pages to fetch.

    Returns:
        list[dict]: One record per listing with raw text fields plus
        numeric `total_price_num`, `unit_price_num` and `area_num`.
    """
    house_list = []
    for page in range(1, pages + 1):
        url = f"{BASE_URL}/ershoufang/{city}/{district}/pg{page}"
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            house_cards = soup.find_all('div', class_='list-item')
            for card in house_cards:
                try:
                    house = {
                        'title': card.find('div', class_='title').text.strip(),
                        'total_price': card.find('div', class_='total-price').text.strip(),
                        'unit_price': card.find('div', class_='unit-price').text.strip(),
                        'area': card.find('div', class_='area').text.strip(),
                        'layout': card.find('div', class_='layout').text.strip(),
                        'district': card.find('div', class_='district').text.strip(),
                        'address': card.find('div', class_='address').text.strip(),
                        'tags': [tag.text for tag in card.find_all('span', class_='tag')],
                        'house_url': card.find('a')['href'],
                        'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    }
                    # Numeric versions with units such as "万" / "元/㎡" stripped.
                    house['total_price_num'] = extract_price_number(house['total_price'])
                    house['unit_price_num'] = extract_price_number(house['unit_price'])
                    house['area_num'] = extract_area_number(house['area'])
                    house_list.append(house)
                    print(f"采集: {house['title'][:30]}...")
                # TypeError/KeyError added: a card without an <a> tag (or
                # without 'href') previously escaped this handler and
                # aborted the remaining cards on the page.
                except (AttributeError, ValueError, TypeError, KeyError):
                    continue

            print(f"第 {page} 页完成,共 {len(house_cards)} 套")
            time.sleep(random.uniform(2, 4))  # polite, jittered delay
        except Exception as e:
            print(f"请求失败: {e}")
    return house_list


def extract_price_number(price_str):
    """Return the first number in a price string ('650万' -> 650.0).

    Thousands separators are removed before matching; returns 0 when
    no number is present.
    """
    match = _NUM_RE.search(price_str.replace(',', ''))
    return float(match.group()) if match else 0


def extract_area_number(area_str):
    """Return the first number in an area string ('89.5㎡' -> 89.5), or 0.

    The previous pattern `[\\d.]+` could capture malformed runs like
    '12.3.4' and crash float(); `_NUM_RE` matches a single well-formed
    number. Commas are stripped so '1,234㎡' parses as 1234.0.
    """
    match = _NUM_RE.search(area_str.replace(',', ''))
    return float(match.group()) if match else 0

2.3 房源详情采集

def get_house_detail(house_url):
    """Fetch and parse one listing's detail page.

    Args:
        house_url: Absolute URL, or a path relative to BASE_URL.

    Returns:
        dict: Parsed fields (stable key set, see `detail` below);
        empty dict when the request or parsing fails.
    """
    full_url = house_url if house_url.startswith('http') else f"{BASE_URL}{house_url}"
    try:
        response = session.get(full_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # All keys pre-initialised so callers always see the same schema
        # (facilities/subway_stations/schools used to exist only when the
        # matching page section was present).
        detail = {
            'floor': '',
            'orientation': '',
            'decoration': '',
            'build_year': '',
            'property_type': '',  # NOTE(review): never filled from the page — confirm source
            'elevator': '',
            'parking': '',
            'community_name': '',
            'community_url': '',
            'images': [],
            'facilities': [],
            'subway_stations': [],
            'schools': [],
        }

        # Floor / orientation / decoration / year / elevator / parking:
        # each <li> is routed to the first field whose keyword it contains
        # (same precedence order as the original if/elif chain).
        basic_info = soup.find('div', class_='basic-info')
        if basic_info:
            keyword_fields = (
                ('楼层', 'floor'),
                ('朝向', 'orientation'),
                ('装修', 'decoration'),
                ('年代', 'build_year'),
                ('电梯', 'elevator'),
                ('车位', 'parking'),
            )
            for item in basic_info.find_all('li'):
                text = item.text.strip()
                for keyword, field in keyword_fields:
                    if keyword in text:
                        detail[field] = text
                        break

        # Community (residential compound) name and link.
        community_div = soup.find('div', class_='community-info')
        if community_div:
            community_link = community_div.find('a')
            if community_link:
                detail['community_name'] = community_link.text.strip()
                # .get() instead of ['href']: a missing attribute used to
                # raise KeyError and discard the entire detail record.
                detail['community_url'] = community_link.get('href', '')

        # Photo gallery.
        img_list = soup.find_all('img', class_='house-img')
        detail['images'] = [img.get('src', '') for img in img_list]

        # Nearby facilities.
        facilities = soup.find_all('li', class_='facility-item')
        detail['facilities'] = [f.text.strip() for f in facilities]

        # Subway stations and schools from the location section.
        location_div = soup.find('div', class_='location-info')
        if location_div:
            subway_tags = location_div.find_all('span', class_='subway-tag')
            detail['subway_stations'] = [tag.text.strip() for tag in subway_tags]
            school_tags = location_div.find_all('span', class_='school-tag')
            detail['schools'] = [tag.text.strip() for tag in school_tags]

        return detail
    except Exception as e:
        # Best-effort boundary: callers merge the result via dict.update(),
        # so an empty dict simply leaves the listing without detail fields.
        print(f"获取详情失败: {e}")
        return {}

2.4 新房数据采集

def get_new_house_list(city='北京', pages=5):
    """Scrape new-development (new house) listing pages.

    Args:
        city: City segment of the listing URL.
        pages: Number of paginated pages to fetch.

    Returns:
        list[dict]: One record per project, incl. numeric `avg_price_num`.
    """
    new_houses = []
    for page in range(1, pages + 1):
        url = f"{BASE_URL}/newhouse/{city}/pg{page}"
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for card in soup.find_all('div', class_='new-house-card'):
                try:
                    house = {
                        'project_name': card.find('div', class_='project-name').text.strip(),
                        'developer': card.find('div', class_='developer').text.strip(),
                        'district': card.find('div', class_='district').text.strip(),
                        'address': card.find('div', class_='address').text.strip(),
                        'price': card.find('div', class_='price').text.strip(),
                        'price_range': card.find('div', class_='price-range').text.strip(),
                        'avg_price': card.find('div', class_='avg-price').text.strip(),
                        'opening_time': card.find('div', class_='opening-time').text.strip(),
                        'property_type': card.find('div', class_='property-type').text.strip(),
                        'tags': [tag.text for tag in card.find_all('span', class_='tag')],
                        'house_url': card.find('a')['href'],
                    }
                    # Numeric average price ("元/㎡" etc. stripped).
                    house['avg_price_num'] = extract_price_number(house['avg_price'])
                    new_houses.append(house)
                    print(f"采集: {house['project_name']}")
                # TypeError/KeyError added: a card missing its <a> tag (or
                # 'href') previously aborted the remaining cards on the page.
                except (AttributeError, ValueError, TypeError, KeyError):
                    continue

            print(f"第 {page} 页完成")
            time.sleep(random.uniform(2, 4))  # polite, jittered delay
        except Exception as e:
            print(f"请求失败: {e}")
    return new_houses

2.5 租房数据采集

def get_rent_list(city='北京', pages=5):
    """Scrape rental listing pages.

    Args:
        city: City segment of the listing URL.
        pages: Number of paginated pages to fetch.

    Returns:
        list[dict]: One record per rental with raw text fields plus
        numeric `price_num` and `area_num`.
    """
    rent_list = []
    for page in range(1, pages + 1):
        url = f"{BASE_URL}/zufang/{city}/pg{page}"
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for card in soup.find_all('div', class_='rent-card'):
                try:
                    house = {
                        'title': card.find('div', class_='title').text.strip(),
                        'price': card.find('div', class_='price').text.strip(),
                        'area': card.find('div', class_='area').text.strip(),
                        'layout': card.find('div', class_='layout').text.strip(),
                        'district': card.find('div', class_='district').text.strip(),
                        'community': card.find('div', class_='community').text.strip(),
                        'floor': card.find('div', class_='floor').text.strip(),
                        'tags': [tag.text for tag in card.find_all('span', class_='tag')],
                        'house_url': card.find('a')['href'],
                    }
                    # Numeric price and area (units stripped).
                    house['price_num'] = extract_price_number(house['price'])
                    house['area_num'] = extract_area_number(house['area'])
                    rent_list.append(house)
                    print(f"采集: {house['title'][:30]}...")
                # TypeError/KeyError added: a card missing its <a> tag (or
                # 'href') previously aborted the remaining cards on the page.
                except (AttributeError, ValueError, TypeError, KeyError):
                    continue

            print(f"第 {page} 页完成")
            time.sleep(random.uniform(2, 4))  # polite, jittered delay
        except Exception as e:
            print(f"请求失败: {e}")
    return rent_list

三、数据分析

3.1 各区域房价对比

import matplotlib.pyplot as plt
import seaborn as sns  # NOTE(review): unused in the visible code — confirm before removing


def analyze_price_by_district(df):
    """Bar-chart the 15 most expensive districts by mean total price.

    Saves the chart to 'price_by_district.png'.

    Args:
        df: Listings DataFrame with 'district' and 'total_price_num' columns.

    Returns:
        pd.DataFrame: mean/count per district, sorted by mean descending.
    """
    district_price = df.groupby('district')['total_price_num'].agg(['mean', 'count'])
    district_price = district_price.sort_values('mean', ascending=False).head(15)

    plt.figure(figsize=(14, 6))
    plt.bar(district_price.index, district_price['mean'])
    plt.title('各区域平均房价(Top 15)')
    plt.xlabel('区域')
    plt.ylabel('平均价格(万元)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('price_by_district.png')
    plt.close()  # release the figure; repeated calls otherwise accumulate open figures

    return district_price

3.2 户型与价格关系

def analyze_price_by_layout(df):
    """Analyze how total price varies with the number of rooms.

    Saves the chart to 'price_by_layout.png'.

    NOTE: adds a numeric 'rooms' column to `df` in place (kept for
    compatibility with the original behaviour).

    Args:
        df: Listings DataFrame with 'layout' (e.g. '3室2厅') and
            'total_price_num' columns.

    Returns:
        pd.DataFrame: mean/count/min/max of total price per room count.
    """
    # '3室2厅' -> 3.0; rows whose layout lacks 'N室' become NaN and are
    # excluded by groupby.
    df['rooms'] = df['layout'].str.extract(r'(\d+)室')[0].astype(float)

    layout_price = df.groupby('rooms')['total_price_num'].agg(['mean', 'count', 'min', 'max'])

    plt.figure(figsize=(12, 6))
    plt.bar(layout_price.index, layout_price['mean'])
    plt.title('不同户型平均房价')
    plt.xlabel('房间数')
    plt.ylabel('平均价格(万元)')
    plt.xticks(layout_price.index)
    plt.savefig('price_by_layout.png')
    plt.close()  # release the figure; repeated calls otherwise accumulate open figures

    return layout_price

3.3 单价与面积关系

def analyze_unit_price_area(df):
    """Scatter-plot unit price vs. floor area and report their correlation.

    Saves the chart to 'unit_price_area.png'.

    Args:
        df: Listings DataFrame with 'area_num' and 'unit_price_num' columns.

    Returns:
        float: Pearson correlation between area and unit price.
    """
    plt.figure(figsize=(12, 8))
    plt.scatter(df['area_num'], df['unit_price_num'], alpha=0.5)
    plt.title('单价与面积关系')
    plt.xlabel('面积(㎡)')
    plt.ylabel('单价(元/㎡)')
    plt.savefig('unit_price_area.png')
    plt.close()  # release the figure; repeated calls otherwise accumulate open figures

    correlation = df['area_num'].corr(df['unit_price_num'])
    print(f"面积与单价的相关系数: {correlation:.3f}")
    return correlation

3.4 地铁房溢价分析

def analyze_subway_premium(df):
    """Compare unit prices of subway-tagged vs. other listings.

    Saves the chart to 'subway_premium.png'.

    NOTE: adds a boolean 'has_subway' column to `df` in place. Assumes
    every 'tags' cell is an iterable of strings — TODO confirm for rows
    where crawling produced missing values.

    Args:
        df: Listings DataFrame with 'tags' and 'unit_price_num' columns.

    Returns:
        float: Subway premium in percent (NaN when either group is empty).
    """
    # A listing counts as a "subway home" when any tag mentions 地铁.
    df['has_subway'] = df['tags'].apply(lambda tags: any('地铁' in tag for tag in tags))

    subway_avg = df[df['has_subway']]['unit_price_num'].mean()
    non_subway_avg = df[~df['has_subway']]['unit_price_num'].mean()
    premium = (subway_avg - non_subway_avg) / non_subway_avg * 100

    print(f"地铁房平均单价: {subway_avg:.0f} 元/㎡")
    print(f"非地铁房平均单价: {non_subway_avg:.0f} 元/㎡")
    print(f"地铁房溢价: {premium:.2f}%")

    plt.figure(figsize=(10, 6))
    plt.bar(['地铁房', '非地铁房'], [subway_avg, non_subway_avg])
    plt.title('地铁房与非地铁房单价对比')
    plt.ylabel('单价(元/㎡)')
    plt.savefig('subway_premium.png')
    plt.close()  # release the figure; repeated calls otherwise accumulate open figures

    return premium

四、数据存储

4.1 存储到数据库

import sqlite3


def save_to_database(df, table_name='houses'):
    """Append a DataFrame to a table in the local SQLite database.

    Args:
        df: DataFrame to persist.
        table_name: Destination table (created on first write).
    """
    conn = sqlite3.connect('real_estate.db')
    try:
        # if_exists='append' preserves rows from earlier crawl batches.
        df.to_sql(table_name, conn, if_exists='append', index=False)
    finally:
        # Close even when to_sql raises — the original leaked the connection.
        conn.close()
    print(f"数据已保存到数据库表: {table_name}")

4.2 导出报表

def generate_report(df, output_file='house_report.xlsx'):
    """Write a multi-sheet Excel report (requires openpyxl).

    Sheets: raw data, per-district price stats, layout counts, and the
    distribution of listings across price bands.

    Args:
        df: Listings DataFrame with 'district', 'layout' and
            'total_price_num' columns.
        output_file: Destination .xlsx path.
    """
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        # Sheet 1: raw data as crawled.
        df.to_excel(writer, sheet_name='原始数据', index=False)

        # Sheet 2: price statistics per district.
        district_price = df.groupby('district')['total_price_num'].agg(['mean', 'count', 'min', 'max'])
        district_price.to_excel(writer, sheet_name='区域房价')

        # Sheet 3: listing counts per layout.
        layout_stats = df['layout'].value_counts()
        layout_stats.to_excel(writer, sheet_name='户型统计')

        # Sheet 4: distribution across price bands (unit: 万元).
        price_bins = [0, 200, 300, 400, 500, 600, 800, 1000, float('inf')]
        price_labels = ['200万以下', '200-300万', '300-400万', '400-500万',
                        '500-600万', '600-800万', '800-1000万', '1000万以上']
        # Computed as a local Series: the original wrote a 'price_range'
        # column back onto the caller's DataFrame as a hidden side effect.
        price_range = pd.cut(df['total_price_num'], bins=price_bins,
                             labels=price_labels).rename('price_range')
        price_range_stats = price_range.value_counts().sort_index()
        price_range_stats.to_excel(writer, sheet_name='价格区间')

    print(f"报表已生成: {output_file}")

五、完整流程

def main():
    """End-to-end pipeline: crawl, clean, analyze and persist the data."""
    city = '北京'
    print(f"开始采集 {city} 的房产数据...")

    # 1. Resale (second-hand) listings.
    print("\n步骤1: 采集二手房数据")
    resale = get_second_hand_list(city, pages=10)

    # Enrich only the first 100 listings with detail-page data.
    print("采集二手房详情...")
    detail_limit = 100
    for idx, listing in enumerate(resale[:detail_limit], 1):
        print(f"{idx}/{detail_limit}")
        listing.update(get_house_detail(listing['house_url']))
        time.sleep(random.uniform(1, 2))

    # 2. New-development listings.
    print("\n步骤2: 采集新房数据")
    new_listings = get_new_house_list(city, pages=5)

    # 3. Rental listings.
    print("\n步骤3: 采集租房数据")
    rentals = get_rent_list(city, pages=5)

    # 4. Raw records -> DataFrames.
    print("\n步骤4: 数据清洗")
    df_resale = pd.DataFrame(resale)
    df_new = pd.DataFrame(new_listings)
    df_rent = pd.DataFrame(rentals)

    # 5. Analyses (each saves a chart to disk).
    print("\n步骤5: 数据分析")
    analyze_price_by_district(df_resale)
    analyze_price_by_layout(df_resale)
    analyze_unit_price_area(df_resale)
    analyze_subway_premium(df_resale)

    # 6. Persistence: SQLite tables + Excel report.
    print("\n步骤6: 保存数据")
    save_to_database(df_resale, 'second_hand_houses')
    save_to_database(df_new, 'new_houses')
    save_to_database(df_rent, 'rent_houses')
    generate_report(df_resale)

    print("\n采集和分析完成!")


if __name__ == '__main__':
    main()

六、总结

本文完整介绍了房地产数据的采集和分析流程,包括:

  • 二手房、新房、租房数据采集
  • 房源详细信息提取
  • 价格、面积、户型等关键数据分析
  • 区域对比、地铁房溢价等专业分析
  • 数据存储和报表生成

使用 EasySpider 工具:在分析房产网站时,可以使用 EasySpider 的 URL 参数提取工具分析筛选条件的URL结构,快速生成采集脚本。

重要提醒:房产数据采集涉及商业信息,请遵守相关网站的服务条款。本文仅供学习研究使用,不得用于商业用途。