# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup
import xlwt
# Target URL and request headers (spoof a browser User-Agent)
url = "https://book.douban.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
page_source = response.read().decode('utf-8')
# Parse the page with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')
# Scrape the book-title links
links = soup.find_all('a', class_="name", target="_blank")
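# Note: this assumes the page still renders its bestseller titles as
# <a class="name" target="_blank"> links; if the markup has changed, links will be empty.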
# Store the link text (the book titles)
lists = []
for link in links:
    lists.append(link.string)
# Print the results: the first 10 titles after the "京东" label, the rest after "当当"
number = 0
for i in lists:
    if number == 0:
        print(end="京东:")
    print(end="%s" % i)
    print(end=" ")
    number += 1
    if number == 10:
        print()
        print(end="当当:")
# Create a workbook and set the encoding
workbook = xlwt.Workbook(encoding='utf-8')
# Create a worksheet
worksheet = workbook.add_sheet('The New York Times Bestseller')
# Write the header row and row labels to Excel; write() takes row, column, value
for i in range(1, 11):
    worksheet.write(0, i, label='第%d名' % i)
worksheet.write(1, 0, label='京东')
worksheet.write(2, 0, label='当当')
# Counter, row, and column for filling in the titles:
# the first 10 titles go into row 1, the remainder into row 2
number = 0
row = 1
column = 1
for i in lists:
    number += 1
    if number == 11:
        row = 2
        column = 1
    worksheet.write(row, column, label=i)
    column += 1
# Set the column widths (xlwt widths are in 1/256 of a character, so 256*23 is roughly 23 characters)
for i in range(1, 11):
    worksheet.col(i).width = 256 * 23
# Save the workbook
workbook.save('The New York Times Bestseller.xls')
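
# Optional sanity check: read the saved sheet back and print the header row.
# A minimal sketch, assuming the companion xlrd package is installed; remove if not needed.
import xlrd

book = xlrd.open_workbook('The New York Times Bestseller.xls')
sheet = book.sheet_by_index(0)
print([sheet.cell_value(0, col) for col in range(sheet.ncols)])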