Hello, I am Caixia. I am trying to grab some housing transaction information from a webpage using XPath. Some of the information is scraped successfully, but some is not. I can confirm that the XPath expressions are correct. My program is as follows:
from bs4 import BeautifulSoup as soup
import re
import requests
from parsel import selector
from lxml import etree
import pandas as pd
import openpyxl
import time
# Browser-like request headers (User-Agent string taken from Chrome).
# They are sent with the HTTP request so the site is less likely to block
# our IP address.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Listing page whose community links are followed by the loop further down.
finalurl = 'https://sz.lianjia.com/xiaoqu/luohuqu/pg1cro11/'
print(finalurl)

# The timeout keeps the script from hanging forever on a stalled connection;
# requests waits indefinitely when none is given.
get_finalurl = requests.get(finalurl, headers=headers, timeout=30).content
finalurl_soup = soup(get_finalurl, 'html.parser')

# Each community on the listing page is an <a class="img"> inside
# <ul class="listContent">.
finalurl_ul = finalurl_soup.find('ul', attrs={'class': 'listContent'})
if finalurl_ul is None:
    # find() returns None when the element is absent (for example when the
    # site answers with a block/verification page). Report it and continue
    # with no links instead of failing with an opaque AttributeError.
    print('No <ul class="listContent"> found on ' + finalurl)
    findurl_all = []
else:
    findurl_all = finalurl_ul.find_all('a', attrs={'class': 'img'})
def _first_text(tree, path):
    """Return the first node matched by XPath *path* in *tree*, or None.

    Indexing the result of ``tree.xpath(path)`` with ``[0]`` raises
    IndexError when nothing matches; this helper returns None instead so one
    missing field does not abort the whole run.
    """
    matches = tree.xpath(path)
    return matches[0] if matches else None


# Output key -> XPath of the text node that holds the value on a community
# detail page. Keys are kept exactly as in the original script (including the
# 'xiqoqu_chengqu' spelling) so that anything consuming the dict keeps working.
_XIAOQU_FIELDS = {
    # xiaoqu base information
    'xiaoqu_name': '/html/body/div[4]/div/div[1]/h1/text()',
    'xiaoqu_address': '/html/body/div[4]/div/div[1]/div/text()',
    'xiaoqu_city': '/html/body/div[5]/div[1]/a[2]/text()',
    'xiqoqu_chengqu': '/html/body/div[5]/div[1]/a[3]/text()',
    'xiaoqu_jiedao': '/html/body/div[5]/div[1]/a[4]/text()',
    'xiaoqu_price': '/html/body/div[6]/div[2]/div[1]/div/span[1]/text()',
    'xiaoqu_age': '/html/body/div[6]/div[2]/div[2]/div[1]/span[2]/text()',
    'xiaoqu_buildtype': '/html/body/div[6]/div[2]/div[2]/div[2]/span[2]/text()',
    'xiaoqu_wuyefei': '/html/body/div[6]/div[2]/div[2]/div[3]/span[2]/text()',
    'xiaoqu_wuyecompany': '/html/body/div[6]/div[2]/div[2]/div[4]/span[2]/text()',
    'xiaoqu_developer': '/html/body/div[6]/div[2]/div[2]/div[5]/span[2]/text()',
    'xiaoqu_building_number': '/html/body/div[6]/div[2]/div[2]/div[6]/span[2]/text()',
    'xiaoqu_house_number': '/html/body/div[6]/div[2]/div[2]/div[7]/span[2]/text()',
    # xiaoqu around information
    # NOTE(review): these three come back empty with plain requests. The most
    # likely cause is that the #mapListContainer list is filled in by
    # JavaScript after the page loads, so it is not part of the HTML that
    # requests downloads -- confirm by searching re_html for
    # 'mapListContainer'. If so, the values have to come from the underlying
    # data request or from a real browser driver; an XPath that is correct in
    # the browser cannot match markup that was never sent.
    'xiaoqu_jiaotong_subway1_distance': '//*[@id="mapListContainer"]/ul/li[1]/div/div[1]/span[4]/text()',
    'xiaoqu_jiaotong_subway2_distance': '//*[@id="mapListContainer"]/ul/li[2]/div/div[1]/span[4]/text()',
    'xiaoqu_jiaotong_subway3_distance': '//*[@id="mapListContainer"]/ul/li[3]/div/div[1]/span[4]/text()',
}

# Visit every community link found on the listing page and print one dict of
# scraped fields per community. The loop variable is no longer called `all`,
# which shadowed the builtin of the same name.
for link_tag in findurl_all:
    detail_url = link_tag['href']
    # Send the same headers as the listing request and bound the wait time.
    re_url = requests.get(detail_url, headers=headers, timeout=30)
    re_html = re_url.text
    re_html_e = etree.HTML(re_html)

    info = {'link': detail_url}
    for field_name, field_xpath in _XIAOQU_FIELDS.items():
        # A field whose XPath matches nothing is stored as None.
        info[field_name] = _first_text(re_html_e, field_xpath)
    print(info)
The last three variables are null (empty). Why?
What I have tried:
I have tried to scrape some of the basic community information variables, and I got them. But the last three variables are null.