Python 爬虫:模拟登录并爬取 GitHub

模拟登录并爬取GitHub

实现模拟登录GitHub并获取页面信息,如好友动态、个人信息等内容。

技术路线:requests 库 + pyquery 库

#1.先定义一个Login类,并初始化一些变量,使用requests库的session开启一个会话,且会自动处理cookies

def __init__(self):
    """Prepare the URLs, request headers and HTTP session used by the crawler."""
    # Pages requested during the crawl.
    self.logined_url = 'https://github.com/settings/profile'
    self.feed_url = 'https://github.com/dashboard-feed'
    self.post_url = 'https://github.com/session'
    self.login_url = 'https://github.com/login'

    # Headers sent with every request.
    browser_headers = {}
    browser_headers['Referer'] = 'https://github.com/'
    browser_headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    browser_headers['Host'] = 'github.com'
    self.headers = browser_headers

    # One Session is shared by all requests so that cookies are kept
    # between them and handled automatically.
    self.session = requests.Session()

#2.用 Session 对象的 get() 方法访问 GitHub 登录页面,用 pyquery解析出登录所需的 authenticity_token 信息并返回

## 解析出登录所需要的 authenticity_token
def token(self, timeout=10):
    """Fetch the login page and return the form's ``authenticity_token``.

    The value is read from the ``<input name="authenticity_token">``
    element of the page at ``self.login_url``.

    Args:
        timeout: Seconds to wait for the server before the request is
            aborted. Without a timeout a stalled connection would block
            the program indefinitely.

    Returns:
        The ``value`` attribute of the token input. Presumably ``None``
        when the page contains no such input (pyquery behaviour --
        TODO confirm).
    """
    response = self.session.get(
        self.login_url, headers=self.headers, timeout=timeout
    )
    selector = pq(response.text)
    return selector('input[name="authenticity_token"]').attr('value')

#3.首先构造登录表单数据,然后用 Session 对象的 post() 方法模拟登录。登录之后用 Session 对象请求动态页面,并用 dynamics() 方法处理其响应。接下来再用 Session 对象请求个人详情页,然后用 profile() 方法来处理个人详情页信息。

def login(self, email, password, timeout=10):
    """Sign in to GitHub, then print the activity feed and profile details.

    Posts the sign-in form to ``self.post_url``, then requests
    ``self.feed_url`` and ``self.logined_url`` with the same session and
    hands each page that answers with HTTP 200 to ``dynamics()`` and
    ``profile()`` respectively.

    NOTE(review): the outcome of the sign-in POST is not inspected, so a
    rejected login is not reported here; the follow-up pages are simply
    requested regardless.

    Args:
        email: Account name or e-mail, sent as the form's ``login`` field.
        password: Account password.
        timeout: Seconds to wait for each request before aborting it.
            Without a timeout a stalled connection would block the
            program indefinitely.
    """
    post_data = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': self.token(),
        'login': email,
        'password': password
    }
    # Only the cookies stored on the session matter here, so the response
    # object itself is not kept.
    self.session.post(
        self.post_url, data=post_data, headers=self.headers, timeout=timeout
    )

    response = self.session.get(
        self.feed_url, headers=self.headers, timeout=timeout
    )
    if response.status_code == 200:
        self.dynamics(response.text)

    response = self.session.get(
        self.logined_url, headers=self.headers, timeout=timeout
    )
    if response.status_code == 200:
        self.profile(response.text)

#4.这里仍然使用pyquery进行解析获取信息。通过dynamics方法获取到个人动态信息,profile方法获取个人的name,email,site,location信息

## 关注动态信息
def dynamics(self, html):
    """Print the text of every activity entry found in the feed page.

    Args:
        html: HTML source of the dashboard feed page.
    """
    doc = pq(html)
    entries = doc('div[class="d-flex flex-items-baseline"] div')
    # Drop the <span> descendants first so their text is not printed.
    entries.find('span').remove()
    for entry in entries.items():
        print(entry.text().strip())

## 详情页面
def profile(self, html):
    """Print name, e-mail, site and location read from the profile form.

    Args:
        html: HTML source of the profile settings page.
    """
    doc = pq(html)
    # Collected in the order they are printed: name, email, site, location.
    details = (
        doc('input[id="user_profile_name"]').attr('value'),
        doc('select[id="user_profile_email"] option[selected="selected"]').text(),
        doc('input[id="user_profile_blog"]').attr('value'),
        doc('input[id="user_profile_location"]').attr('value'),
    )
    print(*details)

#5.最后新建一个 Login 对象,然后运行程序

if __name__ == "__main__":
    # Imported locally: only the interactive entry point needs it.
    from getpass import getpass

    login = Login()
    email = input("请输入你的github账号")
    # getpass() reads the password without echoing it to the terminal,
    # unlike input(), which would leave it visible on screen.
    password = getpass("请输入你的密码")
    login.login(email=email, password=password)

完整代码

import requests
from pyquery import PyQuery as pq


class Login:
    """Log in to GitHub with a shared session and print account pages.

    ``login()`` posts the sign-in form, then prints the entries of the
    dashboard feed and the name, e-mail, site and location shown on the
    profile settings page.
    """

    def __init__(self):
        """Prepare the URLs, request headers and HTTP session."""
        self.headers = {
            'Referer': 'https://github.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Host': 'github.com'
        }
        self.login_url = 'https://github.com/login'
        self.post_url = 'https://github.com/session'
        self.feed_url = 'https://github.com/dashboard-feed'
        self.logined_url = 'https://github.com/settings/profile'
        # One Session is shared by all requests so that cookies are kept
        # between them and handled automatically.
        self.session = requests.Session()

    def token(self, timeout=10):
        """Fetch the login page and return the form's ``authenticity_token``.

        Args:
            timeout: Seconds to wait for the server before the request is
                aborted. Without a timeout a stalled connection would
                block the program indefinitely.

        Returns:
            The ``value`` attribute of the
            ``<input name="authenticity_token">`` element. Presumably
            ``None`` when the page has no such input (pyquery behaviour --
            TODO confirm).
        """
        response = self.session.get(
            self.login_url, headers=self.headers, timeout=timeout
        )
        selector = pq(response.text)
        return selector('input[name="authenticity_token"]').attr('value')

    def login(self, email, password, timeout=10):
        """Sign in, then print the activity feed and the profile details.

        NOTE(review): the outcome of the sign-in POST is not inspected, so
        a rejected login is not reported here; the follow-up pages are
        simply requested regardless.

        Args:
            email: Account name or e-mail, sent as the ``login`` field.
            password: Account password.
            timeout: Seconds to wait for each request before aborting it.
        """
        post_data = {
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': self.token(timeout=timeout),
            'login': email,
            'password': password
        }
        # Only the cookies stored on the session matter here, so the
        # response object itself is not kept.
        self.session.post(
            self.post_url, data=post_data, headers=self.headers, timeout=timeout
        )

        response = self.session.get(
            self.feed_url, headers=self.headers, timeout=timeout
        )
        if response.status_code == 200:
            self.dynamics(response.text)

        response = self.session.get(
            self.logined_url, headers=self.headers, timeout=timeout
        )
        if response.status_code == 200:
            self.profile(response.text)

    def dynamics(self, html):
        """Print the text of every activity entry found in the feed page.

        Args:
            html: HTML source of the dashboard feed page.
        """
        selector = pq(html)
        dynamics = selector('div[class="d-flex flex-items-baseline"] div')
        # Drop the <span> descendants first so their text is not printed.
        dynamics.find('span').remove()
        for item in dynamics.items():
            print(item.text().strip())

    def profile(self, html):
        """Print name, e-mail, site and location read from the profile form.

        Args:
            html: HTML source of the profile settings page.
        """
        selector = pq(html)
        name = selector('input[id="user_profile_name"]').attr('value')
        email = selector('select[id="user_profile_email"] option[selected="selected"]').text()
        site = selector('input[id="user_profile_blog"]').attr('value')
        location = selector('input[id="user_profile_location"]').attr('value')
        print(name, email, site, location)

if __name__ == "__main__":
    # Imported locally: only the interactive entry point needs it.
    from getpass import getpass

    login = Login()
    email = input("请输入你的github账号")
    # getpass() reads the password without echoing it to the terminal,
    # unlike input(), which would leave it visible on screen.
    password = getpass("请输入你的密码")
    login.login(email=email, password=password)

#输出结果:

DanLCJ forked DanLCJ/api.bilibili from dateolive/api.bilibili
tigercat123 starred dateolive/mdy-blog
DanLCJ started following you
hrdate started following you
hrdate starred dateolive/mdy-blog
zjztsinghua started following you
zjztsinghua starred dateolive/mdy-blog
hairrrrr starred dateolive/dateolive.github.io
GeophyAI forked GeophyAI/dateolive.github.io from dateolive/dateolive.github.io
梦独吟 2448282543@qq.com www.datealive.top 广东广州
点赞

发表评论

昵称和uid可以选填一个,填邮箱必填(留言回复后将会发邮件给你)
tips:输入uid可以快速获得你的昵称和头像