梨视频高级爬取

April 3, 2020

2363 10 minutes and 44 seconds

[DeadPool爬虫]梨视频高级爬取

通过之前的梨视频初级爬取小实战，我们已经简单体会了Python写爬虫的威力，但是有没有感受到局限性？通过查看梨视频的网站，我们可以看到每个页面只加载12个短视频，因此每次爬取的视频内容量是非常不够的，所以我们要更进一步，是不是可以通过代码来实现控制页面加载更多呢？

PS: 如果您已经有爬虫相关知识了，可以直接看我的框架地址Deadpool项目

还是一样，话不多说，直接上最终代码效果~

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time    : 2020/4/3 14:16
@Author  : WangHuan
@Contact : hi_chengzi@126.com
@File    : 2020-04-03.py
@Software: PyCharm
@description: webdriver实现页面加载新数据后抓取信息并保存
"""

import os
import re
import time
import urllib.request
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

def down_top10_techvideo():
    storage = 'videos'
    if storage not in os.listdir():
        os.mkdir(storage)

    seed_url = 'https://www.pearvideo.com/popular_8'
    prefix_url = 'https://www.pearvideo.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }

    # 获取前十名视频的作者
    content = requests.get(seed_url, headers=headers).text
    author_regex = r'<a href="(.*?)" class="column">'
    author_ids = re.findall(author_regex, content)
    author_url = []
    for _ in author_ids:
        _url = prefix_url + _
        author_url.append(_url)

    videos = []
    driver = webdriver.Chrome(executable_path="E:\\Github\\Lesson\\chromedriver.exe")

    # 进入作者页面，获取作者页面下“最新”中的所有视频
    for _ in author_url:
        print('-' * 60)
        print(f'作者地址：{_}')

        # 访问作者页面
        driver.get(_)
        driver.implicitly_wait(10)
        temp_pagesource = driver.page_source

        # 匹配视频链接
        vids_regex = r'<a href="(.*?)" class="vervideo-lilink actplay" target="_blank">'
        vids = re.findall(vids_regex, temp_pagesource)
        # 剔除前六个热门视频
        # vids = vids[6:]
        for i in range(0, 6):
            vids.pop(0)
        for _ in vids:
            _url = prefix_url + _
            videos.append(_url)

        i = 1
        # 实现一直点击“加载更多”直到没有加载出新的内容
        while videos:

            print('当前队列里面有视频数量为：{0}'.format(len(videos)))

            # 访问videos中的视频地址，保存对应视频文件和相关描述信息，并将已保存过的视频链接从videos中去除
            videos.reverse()
            while videos:
                video = videos.pop()
                print(video)
                video_content = requests.get(video, headers=headers).text
                mp4_regex = r'ldUrl="",srcUrl="(.*?)",vdoUrl='
                video_url = re.findall(mp4_regex, video_content)
                title_regex = r'<h1 class="video-tt">(.*?)</h1>'
                video_title = re.findall(title_regex, video_content)

                md5 = hashlib.md5()
                md5.update(video_title[0].encode(encoding='utf-8'))
                md5_title = md5.hexdigest()

                print(f'正在下载视频：{video_title[0]}')
                urllib.request.urlretrieve(video_url[0], os.path.join(storage, f"{md5_title}.mp4"))

                sum_regex = r'<div class="summary">(.*?)</div>'
                video_sum = re.findall(sum_regex, video_content)
                with open(f"videos/{md5_title}.txt", 'w', encoding='utf-8') as f:
                    f.write(video_title[0])
                    f.write(video_sum[0])

            # 在作者页面点击“加载更多”，获取更多视频信息
            driver.find_element(By.ID, "listLoadMore").click()
            print(f'点击“加载更多” {i} 次')
            i += 1
            time.sleep(3)
            # 重新获取页面内容
            new_pagesource = driver.page_source
            # 获取新的vids
            new_vids = re.findall(vids_regex, new_pagesource)
            old_vids = re.findall(vids_regex, temp_pagesource)
            vids = (vid for vid in new_vids if vid not in old_vids)

            # 当没有新vids时，跳出循环
            if vids:
                temp_pagesource = new_pagesource
                for v in vids:
                    _url = prefix_url + v
                    videos.append(_url)
            else:
                break


if __name__ == '__main__':
    down_top10_techvideo()

虽然这次的代码里面混杂了一些还没有涉及的内容，但是先为了咱们的目标"不择手段"，以实现目标为最终目标，后续会介绍selenium的相关知识~

接下来还是一样的，代码解读时间，不过这次我们只讲解一些有差别的地方

===

这段代码是获取视频排行榜页面中的排名前十的作者地址

# 获取前十名视频的作者
content = requests.get(seed_url, headers=headers).text
author_regex = r'<a href="(.*?)" class="column">'
author_ids = re.findall(author_regex, content)
author_url = []
for _ in author_ids:
    _url = prefix_url + _
    author_url.append(_url)

===

这段代码是启动一个Chromedriver来打开浏览器进行页面访问

driver = webdriver.Chrome(executable_path="E:\\Github\\Lesson\\chromedriver.exe")

===

这段代码同上一个课程里面的最后的实战练习一样，将视频以MD5.mp4的形式进行保存，同时用一个MD5.txt的文件记录视频的相关信息

md5 = hashlib.md5()
md5.update(video_title[0].encode(encoding='utf-8'))
md5_title = md5.hexdigest()

print(f'正在下载视频：{video_title[0]}')
urllib.request.urlretrieve(video_url[0], os.path.join(storage, f"{md5_title}.mp4"))

sum_regex = r'<div class="summary">(.*?)</div>'
video_sum = re.findall(sum_regex, video_content)
with open(f"videos/{md5_title}.txt", 'w', encoding='utf-8') as f:
    f.write(video_title[0])
    f.write(video_sum[0])

===

因为我们是通过chrome浏览器打开的页面，所以我们可以通过chrome来触发点击动作

# 在作者页面点击“加载更多”，获取更多视频信息
driver.find_element(By.ID, "listLoadMore").click()
print(f'点击“加载更多” {i} 次')
i += 1
time.sleep(3)

===

这段代码是我们进行加载前后的两次内容对比，通过找到的链接列表来进行比对，找到新增的视频链接

# 重新获取页面内容
new_pagesource = driver.page_source
# 获取新的vids
new_vids = re.findall(vids_regex, new_pagesource)
old_vids = re.findall(vids_regex, temp_pagesource)
vids = (vid for vid in new_vids if vid not in old_vids)

===

最后当我们判断没有新增链接之后，就可以得知，该视频作者的所有短视频都爬取完毕了

# 当没有新vids时，跳出循环
if vids:
    temp_pagesource = new_pagesource
    for v in vids:
        _url = prefix_url + v
        videos.append(_url)
else:
    break

代码链接来自我女朋友的Github-2020-04-07

实战练习

通过上面的更高级一点的代码爬取的实战例子，我们再拔高一点？正好通过拔高例子的过程，再次巩固我们的代码实践能力~

目标是：通过一定的构建逻辑，将上面的实践代码例子，重构成一个可以复用的类对象

这里的重构代码由我亲自操刀

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time    : 2020/4/7 10:26
@Author  : WangHuan
@Contact : hi_chengzi@126.com
@File    : 2020-04-07.py
@Software: PyCharm
@description: 在梨视频科技排行榜中获取前十名视频的对应作者的全部视频内容
"""

import re
import os
import time
import requests
import hashlib
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By



class PearVideoSpider(object):
    """ 爬取梨视频分类中排行榜前十名视频作者的所有视频内容 """

    def __init__(self, seed):
        self.seed = seed
        self.prefix = "https://www.pearvideo.com/"
        self.temp_pagesource = ""
        self.finish_video_urls = set()
        self.storage = 'videos'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
        }
        self.author_urls = set()

        self._setup_dirver()

    def _setup_dirver(self):
        """ 加载谷歌浏览器内核并进行配置 """
        self.driver = webdriver.Chrome(executable_path="E:\\Github\\Lesson\\chromedriver.exe")

    def _click_more(self, elementid):
        """ 加载更多 """
        self.driver.find_element(By.ID, elementid).click()
        time.sleep(3)

    def setup_storage(self):
        """ 创建视频存储路径 """
        if self.storage not in os.listdir():
            os.mkdir(self.storage)

    def get_top10_creator(self, regex):
        """ 获取前十名视频的作者 """
        content = requests.get(self.seed, headers=self.headers).text
        author_ids = re.findall(regex, content)
        for _ in author_ids:
            _url = self.prefix + _
            self.author_urls.add(_url)

    def find_video_urls(self, pagesource):
        """ 查找页面里面视频的链接地址 """
        # 匹配视频链接
        videos = []
        vids_regex = r'<a href="(.*?)" class="vervideo-lilink actplay" target="_blank">'
        vids = re.findall(vids_regex, pagesource)
        for _ in vids:
            _url = self.prefix + _
            videos.append(_url)
        videos.reverse()
        return videos

    def download_video(self, item):
        """ 下载每一个视频并进行存储 """
        video_content = requests.get(item, headers=self.headers).text
        mp4_regex = r'ldUrl="",srcUrl="(.*?)",vdoUrl='
        video_url = re.findall(mp4_regex, video_content)
        title_regex = r'<h1 class="video-tt">(.*?)</h1>'
        video_title = re.findall(title_regex, video_content)

        md5 = hashlib.md5()
        md5.update(video_title[0].encode(encoding='utf-8'))
        md5_title = md5.hexdigest()

        print(f'正在下载视频：{video_title[0]}')
        urllib.request.urlretrieve(video_url[0], os.path.join(self.storage, f"{md5_title}.mp4"))

        sum_regex = r'<div class="summary">(.*?)</div>'
        video_sum = re.findall(sum_regex, video_content)
        with open(f"{self.storage}/{md5_title}.txt", 'w', encoding='utf-8') as f:
            f.write(video_title[0])
            f.write(video_sum[0])

    def run(self):
        self.setup_storage()
        self.get_top10_creator(regex=r'<a href="(.*?)" class="column">')

        # 进入作者页面，获取作者页面下“最新”中的所有视频
        print(f'当前类别视频排名前十的视频作者有{len(self.author_urls)}个')
        for _ in self.author_urls:
            print('-' * 60)
            print(f'作者地址：{_}')
            # 访问作者页面
            self.driver.get(_)
            self.driver.implicitly_wait(10)
            self.temp_pagesource = self.driver.page_source

            # 匹配视频链接
            videos = self.find_video_urls(pagesource=self.temp_pagesource)
            videos = videos[:-6]

            i = 1
            # 实现一直点击“加载更多”直到没有加载出新的内容
            while videos:
                print(f'当前队列里面有视频数量为: {len(videos)}')
                # 访问videos中的视频地址，保存对应视频文件和相关描述信息，并将已保存过的视频链接从videos中去除
                while videos:
                    video = videos.pop()
                    print(f'当前爬取的视频地址: {video}')
                    self.download_video(video)

                # 在作者页面点击“加载更多”，获取更多视频信息
                print(f'点击“加载更多” {i} 次')
                self._click_more("listLoadMore")
                i += 1

                # 重新获取页面内容
                new_pagesource = self.driver.page_source
                # 获取新的vids
                new_vids = self.find_video_urls(new_pagesource)
                old_vids = self.find_video_urls(self.temp_pagesource)
                vids = (vid for vid in new_vids if vid not in old_vids)

                # 当没有新vids时，跳出循环
                if vids:
                    temp_pagesource = new_pagesource
                    videos = vids
                else:
                    break