# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import xlwt
from selenium import webdriver
# Let the user enter the tag to crawl
url1 = "https://book.douban.com/tag/"
url2 = input("Please enter the book tag you want to crawl: ")
url = url1 + str(url2)
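# Douban tags are often Chinese; percent-encoding the tag gives a cleaner
# request URL (an optional sketch using only the standard library; note the
# raw url2 should still be used for the /tag/ regex match below):
# from urllib.parse import quote
# url = url1 + quote(str(url2))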
# Basic steps for dynamic crawling (1: launch Firefox; 2: load the site)
driver = webdriver.Firefox()
driver.get(url)
# Parse the rendered page with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Get the number of the last page from the pagination links
total_pages = soup.find_all('a', href=re.compile(r"/tag/%s" % url2))
page = []
for i in total_pages:
    page.append(i.string)
last_page = int(page[9])
driver.close()
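# A more defensive alternative (a sketch, kept commented out): page[9] assumes
# the pagination always exposes exactly ten link texts, so a tag with only a
# few pages would raise an IndexError. Taking the largest numeric link text
# avoids relying on that fixed position:
# digits = [s for s in page if s and s.strip().isdigit()]
# last_page = max(int(s) for s in digits) if digits else 1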
# Paginate, crawl, store
# For a quick test, add "last_page = 2" on the next line to crawl only two pages
list_title = []
for i in range(0, last_page):
    driver2 = webdriver.Firefox()
    url1 = "?start="
    url2 = "&type=T"
    page_url = url + url1 + str(i * 20) + url2
    driver2.get(page_url)
    soup_page = BeautifulSoup(driver2.page_source, 'html.parser')
    links = soup_page.find_all('a', href=re.compile(r"https://book.douban.com/subject/"),
                               title=re.compile(r""), onclick=re.compile(r"moreurl"))
    driver2.close()
    for link in links:
        list_title.append(link["title"])
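# Launching a fresh Firefox for every page is slow. One driver can be reused
# across all pages instead (a sketch under the same parsing logic, commented out):
# driver2 = webdriver.Firefox()
# for i in range(0, last_page):
#     driver2.get(url + "?start=" + str(i * 20) + "&type=T")
#     soup_page = BeautifulSoup(driver2.page_source, 'html.parser')
#     for link in soup_page.find_all('a', href=re.compile(r"https://book.douban.com/subject/"),
#                                    title=re.compile(r""), onclick=re.compile(r"moreurl")):
#         list_title.append(link["title"])
# driver2.close()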
# Print the collected titles
for i in list_title:
    print(i)
# Create a workbook and set the encoding
workbook = xlwt.Workbook(encoding='utf-8')
# Create a worksheet; xlwt rejects sheet names longer than 31 characters,
# so keep the name short (the original long name would raise an error)
worksheet = workbook.add_sheet('Douban books by tag')
# Write the headers to Excel; worksheet.write takes (row, column, value)
for i in range(1, last_page + 1):
    worksheet.write(0, i, label='Page %d' % i)
for i in range(1, 21):
    worksheet.write(i, 0, label='No. %d on this page' % i)
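# Resulting layout: row 0 carries the page headers ('Page 1' through 'Page N'),
# column 0 carries the rank labels ('No. 1 on this page' through
# 'No. 20 on this page'), and the book titles fill the grid between them.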
# Counter, row, and column trackers
number = 0
row = 1
column = 0
# Write the scraped titles into the grid
for i in list_title:
    number += 1
    if number % 20 == 1:
        row = 1
        column += 1
    worksheet.write(row, column, label=i)
    row += 1
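# Worked example of the indexing above: titles 1-20 land in column 1 (page 1),
# rows 1-20; at title 21, number % 20 == 1 again, so row resets to 1 and
# column advances to 2, i.e. the first book of page 2.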
# Set the column widths
worksheet.col(0).width = 25611
for i in range(1, last_page + 1):
    worksheet.col(i).width = 25623
# Save the workbook
workbook.save('Crawl all the books corresponding to the label.xls')
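# A tag-specific output name keeps separate runs from overwriting each other.
# Note that url2 was reassigned to "&type=T" inside the crawl loop, so the tag
# would have to be kept in its own variable (a hypothetical `tag`, saved right
# after the input() call) for this sketch to work:
# workbook.save('douban_%s.xls' % tag)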