简介:本文将分享房地产网站的数据采集经验,包括二手房、新房、租房等房源信息的采集。通过实际案例,你将学会如何处理分页、筛选条件、地图坐标等复杂功能,并进行房价趋势分析。
一、需求分析
采集目标:
- 房源基本信息(标题、价格、面积、户型)
- 位置信息(区域、商圈、地址)
- 房源详情(楼层、装修、朝向、年代)
- 配套设施(地铁、学校、医院等)
- 房源图片
分析目标:
- 各区域房价对比
- 户型与价格关系
- 价格趋势分析
- 地铁房溢价分析
二、技术实现
2.1 基础配置
import json
import random
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
# 配置
BASE_URL = "https://house.example.com"
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': BASE_URL
}
# 创建 Session
session = requests.Session()
session.headers.update(HEADERS)
2.2 二手房列表采集
def get_second_hand_list(city='北京', district='', pages=5):
"""获取二手房列表"""
house_list = []
for page in range(1, pages + 1):
# 构建URL
url = f"{BASE_URL}/ershoufang/{city}/{district}/pg{page}"
try:
response = session.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# 查找房源卡片
house_cards = soup.find_all('div', class_='list-item')
for card in house_cards:
try:
house = {
'title': card.find('div', class_='title').text.strip(),
'total_price': card.find('div', class_='total-price').text.strip(),
'unit_price': card.find('div', class_='unit-price').text.strip(),
'area': card.find('div', class_='area').text.strip(),
'layout': card.find('div', class_='layout').text.strip(),
'district': card.find('div', class_='district').text.strip(),
'address': card.find('div', class_='address').text.strip(),
'tags': [tag.text for tag in card.find_all('span', class_='tag')],
'house_url': card.find('a')['href'],
'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}
# 提取价格(去掉"万"、"元/㎡"等单位)
house['total_price_num'] = extract_price_number(house['total_price'])
house['unit_price_num'] = extract_price_number(house['unit_price'])
# 提取面积数字
house['area_num'] = extract_area_number(house['area'])
house_list.append(house)
print(f"采集: {house['title'][:30]}...")
except (AttributeError, ValueError) as e:
continue
print(f"第 {page} 页完成,共 {len(house_cards)} 套")
time.sleep(random.uniform(2, 4))
except Exception as e:
print(f"请求失败: {e}")
return house_list
def extract_price_number(price_str):
"""提取价格数字"""
import re
match = re.search(r'[\d,]+\.?\d*', price_str.replace(',', ''))
return float(match.group()) if match else 0
def extract_area_number(area_str):
"""提取面积数字"""
import re
match = re.search(r'[\d\.]+', area_str)
return float(match.group()) if match else 0
2.3 房源详情采集
def get_house_detail(house_url):
"""获取房源详情"""
full_url = f"{BASE_URL}{house_url}" if not house_url.startswith('http') else house_url
try:
response = session.get(full_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
detail = {
'floor': '',
'orientation': '',
'decoration': '',
'build_year': '',
'property_type': '',
'elevator': '',
'parking': '',
'community_name': '',
'community_url': '',
'images': []
}
# 提取楼层、朝向、装修等信息
basic_info = soup.find('div', class_='basic-info')
if basic_info:
info_items = basic_info.find_all('li')
for item in info_items:
text = item.text.strip()
if '楼层' in text:
detail['floor'] = text
elif '朝向' in text:
detail['orientation'] = text
elif '装修' in text:
detail['decoration'] = text
elif '年代' in text:
detail['build_year'] = text
elif '电梯' in text:
detail['elevator'] = text
elif '车位' in text:
detail['parking'] = text
# 小区信息
community_div = soup.find('div', class_='community-info')
if community_div:
community_link = community_div.find('a')
if community_link:
detail['community_name'] = community_link.text.strip()
detail['community_url'] = community_link['href']
# 房源图片
img_list = soup.find_all('img', class_='house-img')
detail['images'] = [img.get('src', '') for img in img_list]
# 配套设施
facilities = soup.find_all('li', class_='facility-item')
detail['facilities'] = [f.text.strip() for f in facilities]
# 地理位置信息
location_div = soup.find('div', class_='location-info')
if location_div:
# 提取地铁信息
subway_tags = location_div.find_all('span', class_='subway-tag')
detail['subway_stations'] = [tag.text.strip() for tag in subway_tags]
# 提取学校信息
school_tags = location_div.find_all('span', class_='school-tag')
detail['schools'] = [tag.text.strip() for tag in school_tags]
return detail
except Exception as e:
print(f"获取详情失败: {e}")
return {}
2.4 新房数据采集
def get_new_house_list(city='北京', pages=5):
"""获取新房列表"""
new_houses = []
for page in range(1, pages + 1):
url = f"{BASE_URL}/newhouse/{city}/pg{page}"
try:
response = session.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
cards = soup.find_all('div', class_='new-house-card')
for card in cards:
try:
house = {
'project_name': card.find('div', class_='project-name').text.strip(),
'developer': card.find('div', class_='developer').text.strip(),
'district': card.find('div', class_='district').text.strip(),
'address': card.find('div', class_='address').text.strip(),
'price': card.find('div', class_='price').text.strip(),
'price_range': card.find('div', class_='price-range').text.strip(),
'avg_price': card.find('div', class_='avg-price').text.strip(),
'opening_time': card.find('div', class_='opening-time').text.strip(),
'property_type': card.find('div', class_='property-type').text.strip(),
'tags': [tag.text for tag in card.find_all('span', class_='tag')],
'house_url': card.find('a')['href']
}
# 提取均价
house['avg_price_num'] = extract_price_number(house['avg_price'])
new_houses.append(house)
print(f"采集: {house['project_name']}")
except (AttributeError, ValueError) as e:
continue
print(f"第 {page} 页完成")
time.sleep(random.uniform(2, 4))
except Exception as e:
print(f"请求失败: {e}")
return new_houses
2.5 租房数据采集
def get_rent_list(city='北京', pages=5):
"""获取租房列表"""
rent_list = []
for page in range(1, pages + 1):
url = f"{BASE_URL}/zufang/{city}/pg{page}"
try:
response = session.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
cards = soup.find_all('div', class_='rent-card')
for card in cards:
try:
house = {
'title': card.find('div', class_='title').text.strip(),
'price': card.find('div', class_='price').text.strip(),
'area': card.find('div', class_='area').text.strip(),
'layout': card.find('div', class_='layout').text.strip(),
'district': card.find('div', class_='district').text.strip(),
'community': card.find('div', class_='community').text.strip(),
'floor': card.find('div', class_='floor').text.strip(),
'tags': [tag.text for tag in card.find_all('span', class_='tag')],
'house_url': card.find('a')['href']
}
# 提取价格数字
house['price_num'] = extract_price_number(house['price'])
# 提取面积数字
house['area_num'] = extract_area_number(house['area'])
rent_list.append(house)
print(f"采集: {house['title'][:30]}...")
except (AttributeError, ValueError) as e:
continue
print(f"第 {page} 页完成")
time.sleep(random.uniform(2, 4))
except Exception as e:
print(f"请求失败: {e}")
return rent_list
三、数据分析
3.1 各区域房价对比
import matplotlib.pyplot as plt
import seaborn as sns
def analyze_price_by_district(df):
"""各区域房价分析"""
# 按区域分组,计算平均价格
district_price = df.groupby('district')['total_price_num'].agg(['mean', 'count'])
district_price = district_price.sort_values('mean', ascending=False).head(15)
# 可视化
plt.figure(figsize=(14, 6))
plt.bar(district_price.index, district_price['mean'])
plt.title('各区域平均房价(Top 15)')
plt.xlabel('区域')
plt.ylabel('平均价格(万元)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('price_by_district.png')
return district_price
3.2 户型与价格关系
def analyze_price_by_layout(df):
"""户型与价格关系分析"""
# 提取户型信息(如:3室2厅 -> 3室)
df['rooms'] = df['layout'].str.extract(r'(\d+)室')[0].astype(float)
# 按房间数分组
layout_price = df.groupby('rooms')['total_price_num'].agg(['mean', 'count', 'min', 'max'])
# 可视化
plt.figure(figsize=(12, 6))
plt.bar(layout_price.index, layout_price['mean'])
plt.title('不同户型平均房价')
plt.xlabel('房间数')
plt.ylabel('平均价格(万元)')
plt.xticks(layout_price.index)
plt.savefig('price_by_layout.png')
return layout_price
3.3 单价与面积关系
def analyze_unit_price_area(df):
"""单价与面积关系分析"""
plt.figure(figsize=(12, 8))
plt.scatter(df['area_num'], df['unit_price_num'], alpha=0.5)
plt.title('单价与面积关系')
plt.xlabel('面积(㎡)')
plt.ylabel('单价(元/㎡)')
plt.savefig('unit_price_area.png')
# 计算相关系数
correlation = df['area_num'].corr(df['unit_price_num'])
print(f"面积与单价的相关系数: {correlation:.3f}")
return correlation
3.4 地铁房溢价分析
def analyze_subway_premium(df):
"""地铁房溢价分析"""
# 标记是否有地铁
df['has_subway'] = df['tags'].apply(lambda x: any('地铁' in tag for tag in x))
# 对比价格
subway_avg = df[df['has_subway']]['unit_price_num'].mean()
non_subway_avg = df[~df['has_subway']]['unit_price_num'].mean()
premium = (subway_avg - non_subway_avg) / non_subway_avg * 100
print(f"地铁房平均单价: {subway_avg:.0f} 元/㎡")
print(f"非地铁房平均单价: {non_subway_avg:.0f} 元/㎡")
print(f"地铁房溢价: {premium:.2f}%")
# 可视化
plt.figure(figsize=(10, 6))
plt.bar(['地铁房', '非地铁房'], [subway_avg, non_subway_avg])
plt.title('地铁房与非地铁房单价对比')
plt.ylabel('单价(元/㎡)')
plt.savefig('subway_premium.png')
return premium
四、数据存储
4.1 存储到数据库
import sqlite3
def save_to_database(df, table_name='houses'):
"""保存到数据库"""
conn = sqlite3.connect('real_estate.db')
df.to_sql(table_name, conn, if_exists='append', index=False)
conn.close()
print(f"数据已保存到数据库表: {table_name}")
4.2 导出报表
def generate_report(df, output_file='house_report.xlsx'):
"""生成报表"""
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
# 原始数据
df.to_excel(writer, sheet_name='原始数据', index=False)
# 各区域房价
district_price = df.groupby('district')['total_price_num'].agg(['mean', 'count', 'min', 'max'])
district_price.to_excel(writer, sheet_name='区域房价')
# 户型统计
layout_stats = df['layout'].value_counts()
layout_stats.to_excel(writer, sheet_name='户型统计')
# 价格区间
price_bins = [0, 200, 300, 400, 500, 600, 800, 1000, float('inf')]
price_labels = ['200万以下', '200-300万', '300-400万', '400-500万',
'500-600万', '600-800万', '800-1000万', '1000万以上']
df['price_range'] = pd.cut(df['total_price_num'], bins=price_bins, labels=price_labels)
price_range_stats = df['price_range'].value_counts().sort_index()
price_range_stats.to_excel(writer, sheet_name='价格区间')
print(f"报表已生成: {output_file}")
五、完整流程
def main():
"""主流程"""
city = '北京'
print(f"开始采集 {city} 的房产数据...")
# 1. 采集二手房数据
print("\n步骤1: 采集二手房数据")
second_hand = get_second_hand_list(city, pages=10)
# 采集详情
print("采集二手房详情...")
for i, house in enumerate(second_hand[:100], 1): # 只采集前100套的详情
print(f"{i}/{100}")
detail = get_house_detail(house['house_url'])
house.update(detail)
time.sleep(random.uniform(1, 2))
# 2. 采集新房数据
print("\n步骤2: 采集新房数据")
new_houses = get_new_house_list(city, pages=5)
# 3. 采集租房数据
print("\n步骤3: 采集租房数据")
rent_list = get_rent_list(city, pages=5)
# 4. 数据清洗
print("\n步骤4: 数据清洗")
df_second = pd.DataFrame(second_hand)
df_new = pd.DataFrame(new_houses)
df_rent = pd.DataFrame(rent_list)
# 5. 数据分析
print("\n步骤5: 数据分析")
analyze_price_by_district(df_second)
analyze_price_by_layout(df_second)
analyze_unit_price_area(df_second)
analyze_subway_premium(df_second)
# 6. 保存数据
print("\n步骤6: 保存数据")
save_to_database(df_second, 'second_hand_houses')
save_to_database(df_new, 'new_houses')
save_to_database(df_rent, 'rent_houses')
generate_report(df_second)
print("\n采集和分析完成!")
if __name__ == '__main__':
main()
六、总结
本文完整介绍了房地产数据的采集和分析流程,包括:
- 二手房、新房、租房数据采集
- 房源详细信息提取
- 价格、面积、户型等关键数据分析
- 区域对比、地铁房溢价等专业分析
- 数据存储和报表生成
使用 EasySpider 工具:在分析房产网站时,可以使用 EasySpider 的 URL 参数提取工具分析筛选条件的URL结构,快速生成采集脚本。
重要提醒:房产数据采集涉及商业信息,请遵守相关网站的服务条款。本文仅供学习研究使用,不得用于商业用途。