# -*- coding: utf-8 -*-
"""Douban book finder.

Prompt for a book title, search book.douban.com with a Firefox webdriver,
open the first result, scrape the book's info block, the content/author
introductions and the vendor price list, print a summary to stdout and
save everything into 'Book finder.xls'.

NOTE(review): uses the selenium 3 `find_element_by_*` API, matching the
original script; selenium 4 removed these in favour of
`find_element(By.ID, ...)` — confirm the installed selenium version.
"""
import re

from bs4 import BeautifulSoup
import xlwt
from selenium import webdriver

DOUBAN_URL = "https://book.douban.com/"


def fetch_book_page(title):
    """Search douban for *title*, open the first hit.

    Returns (book_url, page_source) of the book's detail page.
    The browser is always closed, even if a lookup step raises.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(DOUBAN_URL)
        # "inp-query" is the id of the search box on book.douban.com.
        driver.find_element_by_id("inp-query").send_keys(title)
        # Submit the search, then follow the first result link.
        driver.find_element_by_xpath("//*[@type='submit']").click()
        driver.find_element_by_xpath("//*[@class='title-text']").click()
        return driver.current_url, driver.page_source
    finally:
        driver.close()


def parse_book_info(page_source):
    """Extract the book's "info" block as (labels, values).

    labels come from the <span class="pl"> field names ("作者", "出版社", ...);
    values from the <a> links inside <div id="info">.
    """
    soup = BeautifulSoup(page_source, "html.parser")
    labels = [span.string for span in soup.find_all("span", class_="pl")]
    # Strip <br> tags and newlines first: with nested markup removed,
    # BeautifulSoup's .string yields the link texts cleanly.
    cleaned = (page_source.replace("<br>", "")
                          .replace("<br/>", "")
                          .replace("\n", ""))
    info = BeautifulSoup(cleaned, "html.parser").find("div", id="info")
    values = []
    for a in info.find_all("a"):
        if a.string:  # skip links with nested markup (no direct string)
            values.append(a.string.replace(" ", "").replace("\n", ""))
    return labels, values


def parse_intros(page_source):
    """Return (content_intro, author_intro) as lists of paragraph strings.

    The page carries 2-4 <div class="intro"> blocks (short and full
    variants of each introduction); this heuristic — kept from the
    original script — selects the full content intro and the author intro.
    """
    soup = BeautifulSoup(page_source, "html.parser")
    intros = [div.find_all("p")
              for div in soup.find_all("div", class_="intro")]
    if len(intros) == 3:
        del intros[1]
    elif len(intros) == 4:
        del intros[0]
        del intros[2]
    # p.string is None for paragraphs with nested tags; drop them so the
    # later ''.join / print calls never see None.
    content = [p.string for p in intros[0] if p.string]
    author = [p.string for p in intros[1] if p.string]
    return content, author


def fetch_prices(buy_url):
    """Scrape the buylinks page.

    Returns a flat list alternating shop name and price string, taken
    from the <a> inside each <td class="pl2">.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(buy_url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.close()
    entries = []
    for td in soup.find_all("td", class_="pl2"):
        link = td.find("a")
        if link is not None:
            entries.append(link.string)
    return entries


def _price_value(text):
    """Numeric amount in a price string like '25.00元'; inf if none found."""
    match = re.search(r"\d+(?:\.\d+)?", text or "")
    return float(match.group()) if match else float("inf")


def cheapest(entries):
    """Return the (shop, price) pair with the lowest numeric price.

    *entries* alternates shop name (even index) and price (odd index).
    Prices are compared numerically — the original compared the raw
    strings, which ranks '9.00' above '10.00'.
    """
    best_index = 1
    best_value = _price_value(entries[1])
    for i in range(3, len(entries), 2):
        value = _price_value(entries[i])
        if value < best_value:
            best_value = value
            best_index = i
    return entries[best_index - 1], entries[best_index]


def save_to_excel(title, labels, values, content_text, author_text,
                  shop, price):
    """Write the collected data to 'Book finder.xls' (xlwt => legacy .xls)."""
    workbook = xlwt.Workbook(encoding="utf-8")
    sheet = workbook.add_sheet("Book finder")
    sheet.write(0, 0, label="书名:")
    sheet.write(0, 1, label="%s" % title)
    row = 1
    for label, value in zip(labels, values):
        sheet.write(row, 0, label="%s" % label)
        sheet.write(row, 1, label="%s" % value)
        row += 1
    sheet.write(row, 0, label="内容简介:")
    sheet.write(row, 1, label="%s" % content_text)
    sheet.write(row + 1, 0, label="作者简介:")
    sheet.write(row + 1, 1, label="%s" % author_text)
    sheet.write(row + 2, 0, label="购买建议:")
    sheet.write(row + 2, 1, label="%s" % shop)
    sheet.write(row + 2, 2, label="%s" % price)
    # Column widths kept from the original layout.
    sheet.col(0).width = 2569
    sheet.col(1).width = 25622
    workbook.save("Book finder.xls")


def main():
    book = input("Title:")
    book_url, page_source = fetch_book_page(book)

    # Print the info block: each field name immediately followed by value.
    labels, values = parse_book_info(page_source)
    for label, value in zip(labels, values):
        print(end=label)
        print(value)

    content, author = parse_intros(page_source)
    print(end="内容简介:")
    for paragraph in content:
        print(end=paragraph)
    print()
    print(end="作者简介:")
    for paragraph in author:
        print(end=paragraph)
    print()

    # The buylinks page lives directly under the book's own URL.
    entries = fetch_prices(book_url + "buylinks")
    shop, price = cheapest(entries)
    print("最便宜的网店名以及价格:%s:%s" % (shop, price))

    save_to_excel(book, labels, values,
                  "".join(content), "".join(author), shop, price)


if __name__ == "__main__":
    main()