F_JustWei's Studio.

Book finder

字数统计: 712 | 阅读时长: 3 min
2019/02/21 Share

# -*- coding: utf-8 -*-

import re

import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Landing page of Douban Books; its search box is used to look up the title.
DOUBAN_BOOK_URL = "https://book.douban.com/"
# Spreadsheet the collected report is saved to.
REPORT_FILE = "Book finder.xls"
# Seconds Selenium keeps polling for an element before raising.
ELEMENT_WAIT_SECONDS = 10
# xlwt measures column widths in 1/256ths of a character width.
XLS_CHAR_WIDTH = 256
# Matches the first decimal number inside a price label.
PRICE_PATTERN = re.compile(r"\d+(?:\.\d+)?")


def clean_field(text):
    """Return *text* with every space and newline removed."""
    return text.replace(" ", "").replace("\n", "")


def select_intro_blocks(blocks):
    """Pick the two intro blocks the report uses from *blocks*.

    The page may carry two, three or four ``div.intro`` blocks.  The
    selection mirrors the original script: with three blocks the middle one
    is dropped, with four blocks the first and the last are dropped, and any
    other count is returned unchanged.  The input list is not modified.
    """
    blocks = list(blocks)
    if len(blocks) == 3:
        return [blocks[0], blocks[2]]
    if len(blocks) == 4:
        return [blocks[1], blocks[2]]
    return blocks


def parse_price(text):
    """Return the first number found in *text* as a float, or None."""
    if not text:
        return None
    match = PRICE_PATTERN.search(text.replace(",", ""))
    return float(match.group()) if match else None


def find_cheapest_offer(quotes):
    """Return ``(shop, price_text)`` for the lowest-priced offer, or None.

    *quotes* is a flat list alternating shop name and price text
    (``[shop, price, shop, price, ...]``), which is the layout the original
    script assumed.  Prices are compared numerically; comparing the raw
    strings would rank "9.50" above "45.00".  Offers whose price cannot be
    parsed are skipped, and the first offer wins a tie.
    """
    best_offer = None
    best_price = None
    for shop, price_text in zip(quotes[0::2], quotes[1::2]):
        price = parse_price(price_text)
        if price is None:
            continue
        if best_price is None or price < best_price:
            best_offer = (shop, price_text)
            best_price = price
    return best_offer


def build_buy_url(book_url):
    """Return the "buylinks" page URL that belongs to *book_url*.

    Any query string or fragment is dropped and a trailing slash is added
    when missing, so the suffix always lands on the path.
    """
    base = book_url.split("#", 1)[0].split("?", 1)[0]
    if not base.endswith("/"):
        base += "/"
    return base + "buylinks"


def fetch_book_details(driver, title):
    """Search Douban for *title* and scrape the first result's page.

    Returns a dict with the keys ``url`` (address of the book page),
    ``labels`` (texts of the ``span.pl`` field labels), ``values`` (cleaned
    field values), ``content`` and ``author`` (paragraph texts of the
    content and author introductions).
    """
    driver.get(DOUBAN_BOOK_URL)
    # "inp-query" is the id of the search box on the landing page.
    driver.find_element(By.ID, "inp-query").send_keys(title)
    driver.find_element(By.XPATH, "//*[@type='submit']").click()
    # Follow the first search hit.
    driver.find_element(By.XPATH, "//*[@class='title-text']").click()

    book_url = driver.current_url
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")

    # Field labels ("author", "publisher", ...); a label without a single
    # text child has no .string, so fall back to an empty label.
    labels = [tag.string or "" for tag in soup.find_all("span", class_="pl")]

    # Most field values are bare text nodes between a label and a line break,
    # so wrap them in <a> tags to make them reachable through find_all("a").
    # NOTE(review): the two tag literals were stripped from the published
    # script; this pair is a reconstruction - confirm against the live markup.
    wrapped = page.replace("</span>", "</span><a>").replace("<br>", "</a><br>")
    info = BeautifulSoup(wrapped, "html.parser").find("div", id="info", class_="")
    anchors = info.find_all("a") if info is not None else []
    values = [clean_field(a.string) for a in anchors if a.string]

    # Each intro block is reduced to the texts of its paragraphs; get_text()
    # also copes with paragraphs that contain nested tags.
    intro_blocks = select_intro_blocks(
        div.find_all("p") for div in soup.find_all("div", class_="intro")
    )
    content = intro_blocks[0] if len(intro_blocks) > 0 else []
    author = intro_blocks[1] if len(intro_blocks) > 1 else []

    return {
        "url": book_url,
        "labels": labels,
        "values": values,
        "content": [p.get_text() for p in content],
        "author": [p.get_text() for p in author],
    }


def fetch_price_quotes(driver, buy_url):
    """Load *buy_url* and return the link texts found in its ``td.pl2`` cells."""
    driver.get(buy_url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = (cell.find("a") for cell in soup.find_all("td", class_="pl2"))
    return [link.get_text(strip=True) for link in links if link is not None]


def save_report(title, fields, content_text, author_text, offer):
    """Write the collected data to ``REPORT_FILE`` as an .xls workbook.

    *fields* is a list of ``(label, value)`` pairs; *offer* is the
    ``(shop, price_text)`` pair of the cheapest shop, or None when no price
    could be determined (the suggestion cells are then left empty).
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("Book finder")

    worksheet.write(0, 0, label="书名:")
    worksheet.write(0, 1, label="%s" % title)
    for row, (label, value) in enumerate(fields, start=1):
        worksheet.write(row, 0, label="%s" % label)
        worksheet.write(row, 1, label="%s" % value)

    row = len(fields) + 1
    worksheet.write(row, 0, label="内容简介:")
    worksheet.write(row, 1, label="%s" % content_text)
    worksheet.write(row + 1, 0, label="作者简介:")
    worksheet.write(row + 1, 1, label="%s" % author_text)
    worksheet.write(row + 2, 0, label="购买建议:")
    if offer is not None:
        worksheet.write(row + 2, 1, label="%s" % offer[0])
        worksheet.write(row + 2, 2, label="%s" % offer[1])

    # Column widths: 9 and 22 characters.
    worksheet.col(0).width = XLS_CHAR_WIDTH * 9
    worksheet.col(1).width = XLS_CHAR_WIDTH * 22

    workbook.save(REPORT_FILE)


def main():
    """Ask for a book title, scrape Douban, print a report and save it."""
    book = input("Title:")

    driver = webdriver.Firefox()
    try:
        # The pages may still be rendering right after a click, so let
        # Selenium poll for elements instead of failing immediately.
        driver.implicitly_wait(ELEMENT_WAIT_SECONDS)
        details = fetch_book_details(driver, book)
        quotes = fetch_price_quotes(driver, build_buy_url(details["url"]))
    finally:
        # quit() ends the whole browser session even when scraping failed.
        driver.quit()

    # Labels and values are paired by position; zip stops at the shorter list.
    fields = list(zip(details["labels"], details["values"]))
    for label, value in fields:
        print(label + value)

    content_text = "".join(details["content"])
    author_text = "".join(details["author"])
    print("内容简介:" + content_text)
    print("作者简介:" + author_text)

    offer = find_cheapest_offer(quotes)
    if offer is not None:
        print("最便宜的网店名以及价格:%s:%s" % offer)

    save_report(book, fields, content_text, author_text, offer)


if __name__ == "__main__":
    main()

CATALOG
  1. 1. -- coding: utf-8 --
  2. 2. 模拟Firefox登陆
  3. 3. 获取用户输入
  4. 4. 查找并输入 id=”inp-query”是该url搜索框的id
  5. 5. 模拟点击搜索
  6. 6. 进入第一个相关链接
  7. 7. 保存第一个相关链接的url(即收集数据的url)
  8. 8. 对当前页面进行bs解析
  9. 9. 读取本书具有的信息的名称
  10. 10. 对当前页面进行修改并进行bs解析
  11. 11. 读取书名,作者,出版社,出版年,页数,定价等内容
  12. 12. 输出本书具有的信息
  13. 13. 对url进行bs解析
  14. 14. 读取内容简介,作者简介
  15. 15. 关闭网页
  16. 16. 输出
  17. 17. 确定购买的url
  18. 18. 动态爬取基础步骤(一:模仿为Firefox 二:读取网站)
  19. 19. 对buy_url进行bs解析
  20. 20. 读取图书的各网点的报价
  21. 21. 关闭网页
  22. 22. 比较那个网店的价格最便宜
  23. 23. 输出最便宜的网店名以及价格
  24. 24. 创建一个workbook 设置编码
  25. 25. 创建一个worksheet
  26. 26. 写入excel
  27. 27. 整合内容简介,作者简介
  28. 28. 导入数据
  29. 29. 设置行列间隔
  30. 30. 保存