Scraping SouFun (搜房网) transaction data with Python

#!/usr/bin/python
# -*- coding: utf-8 -*-
import time
import urllib2
import StringIO
import gzip
import sys
import chardet
from bs4 import BeautifulSoup
import re

# Python 2 hack so mixed GBK/UTF-8 strings print without UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

def getlist(html):
    # Print name, thumbnail, price, and links for every development on one
    # listing page.
    soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
    divs = soup.find_all('div', {'class': 'searchListNoraml'})
    for div_html in divs:
        company_li = div_html.findAll("li", {'class': 's2'})
        price_div = div_html.findAll("div", {'class': 'price'})
        img_tmp = div_html.findAll("img", width='122')
        name_div = div_html.findAll("div", {'class': 'name'})
        type_div = div_html.findAll("div", {'class': 'dot6'})
        if company_li:
            print "===============分割线========================"  # divider
            print ("[" + img_tmp[0].get("alt") + "]-----["
                   + img_tmp[0].get("src") + "]----["
                   + price_div[0].get_text().strip() + "]"
                   + type_div[0].get_text().strip())
            print ("[http://newhouse.jiujiang.soufun.com"
                   + company_li[0].a.get('href') + "]----["
                   + company_li[1].font.string + "]")
            # "的网址" = "URL of ..."
            print ("[" + name_div[0].a.string + "]的网址:"
                   + name_div[0].a.get('href'))

def getHtml(url):
    # Fetch url with a desktop User-Agent, un-gzip the body if the server
    # compressed it, then convert whatever charset came back to UTF-8.
    headers = {'User-Agent': 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)'}
    opener = urllib2.build_opener()
    request = urllib2.Request(url, headers=headers)
    request.add_header('Accept-Encoding', 'gzip')
    page = opener.open(request)
    predata = page.read()
    pdata = StringIO.StringIO(predata)
    gzipper = gzip.GzipFile(fileobj=pdata)
    try:
        data = gzipper.read()
    except IOError:
        # Body was not gzip-compressed; use the raw bytes as-is.
        data = predata

    mychar = chardet.detect(data)
    bianma = mychar['encoding']
    if bianma and bianma.lower() == 'utf-8':
        html = data
    else:
        # SouFun mostly serves GB2312/GBK; normalise to UTF-8.
        html = data.decode('gb2312', 'ignore').encode('utf-8')
    return html
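
# A quick sanity check for getHtml alone, using the same listing-URL pattern
# that getSouFun_NewHouseList builds below (page number 1 is an example):
#
#   html = getHtml("http://newhouse.jiujiang.soufun.com/house/%BE%C5%BD%AD"
#                  "_________________1__.htm")
#   print chardet.detect(html)['encoding']  # expect 'utf-8' after conversion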

def getSouFun_NewHouseList():
    # Crawl the 18 paginated listing pages; %BE%C5%BD%AD is the GB2312
    # URL-encoding of "九江" (Jiujiang).
    for i in range(1, 19):
        url_tmp = ("http://newhouse.jiujiang.soufun.com/house/%BE%C5%BD%AD"
                   "_________________" + str(i) + "__.htm")
        tags_html = getHtml(url_tmp)
        time.sleep(2)  # throttle requests
        getlist(tags_html)

def getSouFun_everydayTrace():
    # Build an HTML report: one link per daily transaction bulletin, followed
    # by that bulletin's tables and images via getSouFun_trace_Data().
    print '<html xmlns="http://www.w3.org/1999/xhtml">'
    print '<head>'
    print '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
    # Title: "Jiujiang urban area -- transaction statistics"
    print '<title>九江城区--成交情况统计</title>'
    print '</head>'
    print '<body>'
    for i in range(1, 2):
        list_url = ("http://newhouse.jiujiang.soufun.com/house/web/"
                    "newhouse_news_more.php?type=12193&page=" + str(i))
        list_html = getHtml(list_url)

        soup = BeautifulSoup(list_html, "html.parser", from_encoding="utf-8")
        divs = soup.find_all('div', {'class': 'lnews'})
        for div_html in divs:
            li_tracelist = div_html.findAll("li", {'class': ''})
            for lit in li_tracelist:
                # Rewriting '.' to '_all.' turns the bulletin URL into its
                # single-page ("view all") variant.
                href = lit.a.get('href').replace('.', '_all.')
                # "的链接" = "link of ..."; "平方米" = "square metres".
                print ("<p>[<a href='http://newhouse.jiujiang.soufun.com"
                       + href + "' >" + lit.a.string + "平方米</a>]"
                       + "的链接:http://newhouse.jiujiang.soufun.com"
                       + href + "</p>")
                print "<br/>"
                getSouFun_trace_Data("http://newhouse.jiujiang.soufun.com"
                                     + href)
                time.sleep(2)
    print "</body></html>"


def getSouFun_trace_Data(url):
    # Print the bare tables and images from one bulletin's body.
    data_html = getHtml(url)
    soup = BeautifulSoup(data_html, "html.parser", from_encoding="utf-8")

    divs = soup.find_all('div', {'name': 'news_content'})
    if not divs:
        # Some bulletins mark the body with id="news_body" instead.
        divs = soup.find_all('div', {'id': 'news_body'})
    for div_html in divs:
        table_list = div_html.findAll('table', {'class': ''})
        if table_list:
            for table_html in table_list:
                print table_html
                print "<br/>"
            print "<br/>"
        img_list = div_html.findAll('img')
        # Stricter alternative, keeping only news photos:
        # img_list = div_html.findAll('img',
        #                             src=re.compile(r"(.*)news(.*)\.jpg"))
        if img_list:
            for img_html in img_list:
                print img_html
                print "<br/>"
            print "<br/>"