1: Task Introduction and Preparation

Scrape the basic information of the Douban Movie Top250 list.

https://movie.douban.com/top250

Coding conventions:

```python
# coding=utf-8

def hello():
    print("hello")

if __name__ == "__main__":  # entry point: controls the order in which functions run
    hello()
```

Importing modules:

Directory structure:

- test1
  - t1.py
- test2
  - t2.py

t1.py

```python
def add(a, b):
    return a + b

if __name__ == "__main__":
    print(add(1, 2))  # only runs when t1.py is executed directly, not on import
```

In t2.py, import t1 from the test1 package and call it:

```python
from test1 import t1

print(t1.add(3, 5))  # 8
```

Importing third-party modules (install them first if needed, e.g. `pip install beautifulsoup4 xlwt`):

```python
from bs4 import BeautifulSoup        # HTML parsing
import re                            # regular expressions
import urllib.request, urllib.error  # build URLs and fetch page data
import xlwt                          # Excel operations
import sqlite3                       # SQLite database operations
```

2: Building the Workflow

2.1 Basic Framework

```python
def main():
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = getData(baseurl)
    savepath = ".\\豆瓣电影Top250"
    # 3. Save the data
    saveData(savepath)

# Crawl the pages
def getData(baseurl):
    datalist = []
    # 2. Parse the data page by page
    return datalist

# Save the data
def saveData(savepath):
    pass  # saving logic goes here
```
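Following the entry-point convention from section 1, a minimal sketch of how the script is driven (assuming `main()` is the only top-level call):

```python
if __name__ == "__main__":
    main()  # run the whole crawl when the script is executed directly
```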

2.2 Fetching the Data

```python
def askURL(url):
    # User-Agent header: tells Douban which browser type is making the request
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"}
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def getData(baseurl):
    datalist = []
    for i in range(0, 10):        # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askURL(url)
        # Parse each page here
    return datalist
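
As a sketch of the "parse each page" step, one option is to let BeautifulSoup isolate the per-movie blocks first. This assumes each movie on the Top250 page sits in a `div` with class `item`; `parsePage` is a hypothetical helper, not part of the framework above:

```python
from bs4 import BeautifulSoup

def parsePage(html, datalist):
    # Hypothetical helper: collect the raw HTML block of every movie on one page
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all("div", class_="item"):  # assumed per-movie container
        datalist.append(str(item))  # field extraction (title, rating, ...) comes later
```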

3: Supplement on the urllib Library

GET request

```python
import urllib.request

response = urllib.request.urlopen("https://www.baidu.com")
print(response.read().decode("utf-8"))  # decode the fetched page source
```

POST request

```python
import urllib.request
import urllib.parse

# POST data must be bytes: urlencode the dict, then encode it
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
```

Timeout handling: give up if there is no response within 0.01 seconds.

```python
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.01)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("time out")
```

Response contents:

```python
response.status               # HTTP status code
response.getheaders()         # all response headers
response.getheader("Server")  # a single header by name
```
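A runnable sketch combining these calls, reusing the httpbin.org endpoint from the examples above:

```python
import urllib.request

response = urllib.request.urlopen("http://httpbin.org/get")
print(response.status)               # e.g. 200
print(response.getheaders())         # list of (name, value) tuples
print(response.getheader("Server"))  # value of the Server header
```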

Browser spoofing: the header values can be inspected in the browser's developer tools (under the request headers) while visiting the site.

```python
url = "https://www.douban.com"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"}
data = bytes(urllib.parse.urlencode({"name": "eric"}), encoding="utf-8")
# Wrap the request details into a Request object
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
response = urllib.request.urlopen(req)
```

4: Supplement on the BeautifulSoup Library

BeautifulSoup turns a complex HTML document into a navigable tree.

```python
import re
from bs4 import BeautifulSoup

file = open("./baidu.html", "rb")
html = file.read()
bs = BeautifulSoup(html, "html.parser")

print(bs.title)         # the whole <title> tag
print(bs.title.string)  # only the text inside <title>
print(bs.a.attrs)       # all attributes of the first <a> tag

# find_all(): find all matching tags
t_lists = bs.find_all("a")                     # all <a> tags
t_lists = bs.find_all(re.compile("a"))         # all tags whose name matches the regex
t_lists = bs.find_all(class_="head")           # by class (note the trailing underscore)
t_lists = bs.find_all(class_=True)             # all tags that have a class attribute
t_lists = bs.find_all(text=re.compile(r"\d"))  # text nodes containing a digit
t_lists = bs.find_all("a", limit=3)            # at most three results

# CSS selectors
bs.select("title")           # by tag name
bs.select(".mnav")           # by class name
bs.select("#u1")             # by id
bs.select("a[class='bri']")  # by attribute
bs.select("head > title")    # by child relationship
bs.select(".mnav ~ .bri")    # siblings of .mnav whose class is bri
```

More details: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

5: Supplement on Regular Expressions
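
A minimal sketch of the standard-library `re` calls a scraper like this relies on (the pattern and sample strings here are illustrative, not taken from the Douban page):

```python
import re

# Compile once, reuse many times
pat = re.compile(r"\d+")  # one or more digits

print(pat.search("Top250 movies"))                  # first match object, or None
print(pat.findall("10 pages x 25 movies"))          # ['10', '25']
print(re.sub(r"\s+", " ", "too   many   spaces"))   # collapse whitespace
```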
