模拟登录并爬取GitHub
实现模拟登录GitHub并获取页面信息,如好友动态、个人信息等内容。
技术路线:requests库 pyquery库
#1.先定义一个Login类,并初始化一些变量,使用requests库的session开启一个会话,且会自动处理cookies
def __init__(self):
    """Prepare the endpoint URLs, request headers and HTTP session."""
    # Pages requested by the other methods of this class.
    self.login_url = 'https://github.com/login'
    self.post_url = 'https://github.com/session'
    self.feed_url = 'https://github.com/dashboard-feed'
    self.logined_url = 'https://github.com/settings/profile'

    # Headers passed along with every request made through the session.
    self.headers = {
        'Referer': 'https://github.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Host': 'github.com',
    }

    # A single Session keeps the conversation alive and handles cookies
    # automatically across requests.
    self.session = requests.Session()
#2.用 Session 对象的 get() 方法访问 GitHub 登录页面,用 pyquery解析出登录所需的 authenticity_token 信息并返回
## 解析出登录所需要的 authenticity_token
def token(self, timeout=10):
    """Fetch the login page and return its ``authenticity_token`` value.

    Args:
        timeout: Seconds to wait for the login page. Without a timeout the
            request could block indefinitely on a stalled connection; the
            default keeps existing ``token()`` calls working unchanged.

    Returns:
        The ``value`` attribute of the ``input[name="authenticity_token"]``
        element of the login page (presumably ``None`` when the element is
        absent -- confirm against the installed pyquery version).
    """
    response = self.session.get(
        self.login_url, headers=self.headers, timeout=timeout
    )
    document = pq(response.text)
    return document('input[name="authenticity_token"]').attr('value')
#3.首先构造登录表单数据,然后用 Session 对象的 post() 方法模拟登录。登录之后用 Session 对象请求动态页面(dashboard-feed),若响应状态码为 200,就用 dynamics() 方法对其进行处理。接下来再用 Session 对象请求个人详情页,同样在状态码为 200 时用 profile() 方法来处理个人详情页信息。
def login(self, email, password, timeout=10):
    """Submit the login form, then print the activity feed and the profile.

    Args:
        email: Account identifier, sent as the form's ``login`` field.
        password: Account password, sent as the form's ``password`` field.
        timeout: Seconds to wait for each request issued directly by this
            method. Without a timeout a stalled connection would block the
            program indefinitely; the default keeps existing calls working.
    """
    form = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': self.token(),
        'login': email,
        'password': password,
    }

    # Only the cookies stored on the session matter here, so the response
    # object is not kept.
    # NOTE(review): the response is never inspected, so a rejected login is
    # not detected at this point.
    self.session.post(
        self.post_url, data=form, headers=self.headers, timeout=timeout
    )

    feed = self.session.get(
        self.feed_url, headers=self.headers, timeout=timeout
    )
    if feed.status_code == 200:
        self.dynamics(feed.text)

    profile_page = self.session.get(
        self.logined_url, headers=self.headers, timeout=timeout
    )
    if profile_page.status_code == 200:
        self.profile(profile_page.text)
#4.这里仍然使用pyquery进行解析获取信息。通过dynamics方法获取到个人动态信息,profile方法获取个人的name,email,site,location信息
## 关注动态信息
def dynamics(self, html):
    """Print the text of every activity entry found in the feed HTML.

    Args:
        html: Markup of the feed page as a string.
    """
    document = pq(html)
    entries = document('div[class="d-flex flex-items-baseline"] div')
    # Remove nested <span> elements so their text is left out of the output.
    entries.find('span').remove()
    for entry in entries.items():
        print(entry.text().strip())
## 详情页面
def profile(self, html):
    """Print name, e-mail, site and location read from the profile page.

    Args:
        html: Markup of the profile settings page as a string.
    """
    document = pq(html)

    # Text inputs carry their content in the ``value`` attribute.
    name = document('input[id="user_profile_name"]').attr('value')
    # The e-mail is the text of the selected <option> of the drop-down.
    email_option = document(
        'select[id="user_profile_email"] option[selected="selected"]'
    )
    site = document('input[id="user_profile_blog"]').attr('value')
    location = document('input[id="user_profile_location"]').attr('value')

    print(name, email_option.text(), site, location)
#5.最后新建一个 Login 对象,然后运行程序
if __name__ == "__main__":
    # getpass reads the password without echoing it on the terminal;
    # input() would leave the typed password visible on screen.
    from getpass import getpass

    login = Login()
    email = input("请输入你的github账号")
    password = getpass("请输入你的密码")
    login.login(email=email, password=password)
完整代码
import requests
from pyquery import PyQuery as pq
class Login:
    """Log in to GitHub with a ``requests`` session and print account data.

    After a login attempt the activity feed and the profile settings page
    are fetched through the same session and parsed with pyquery.
    """

    def __init__(self):
        """Prepare the request headers, endpoint URLs and HTTP session."""
        self.headers = {
            'Referer': 'https://github.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Host': 'github.com',
        }
        self.login_url = 'https://github.com/login'
        self.post_url = 'https://github.com/session'
        self.feed_url = 'https://github.com/dashboard-feed'
        self.logined_url = 'https://github.com/settings/profile'
        # A single Session keeps the conversation alive and handles cookies
        # automatically across requests.
        self.session = requests.Session()

    def token(self, timeout=10):
        """Fetch the login page and return its ``authenticity_token`` value.

        Args:
            timeout: Seconds to wait for the login page. Without a timeout
                the request could block indefinitely on a stalled
                connection; the default keeps ``token()`` calls working.

        Returns:
            The ``value`` attribute of the
            ``input[name="authenticity_token"]`` element (presumably
            ``None`` when the element is absent -- confirm against the
            installed pyquery version).
        """
        response = self.session.get(
            self.login_url, headers=self.headers, timeout=timeout
        )
        document = pq(response.text)
        return document('input[name="authenticity_token"]').attr('value')

    def login(self, email, password, timeout=10):
        """Submit the login form, then print the feed and the profile.

        Args:
            email: Account identifier, sent as the form's ``login`` field.
            password: Account password, sent as the ``password`` field.
            timeout: Seconds to wait for each HTTP request made during the
                login flow. Without a timeout a stalled connection would
                block the program indefinitely.
        """
        form = {
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': self.token(timeout=timeout),
            'login': email,
            'password': password,
        }

        # Only the cookies stored on the session matter here, so the
        # response object is not kept.
        # NOTE(review): the response is never inspected, so a rejected
        # login is not detected at this point.
        self.session.post(
            self.post_url, data=form, headers=self.headers, timeout=timeout
        )

        feed = self.session.get(
            self.feed_url, headers=self.headers, timeout=timeout
        )
        if feed.status_code == 200:
            self.dynamics(feed.text)

        profile_page = self.session.get(
            self.logined_url, headers=self.headers, timeout=timeout
        )
        if profile_page.status_code == 200:
            self.profile(profile_page.text)

    def dynamics(self, html):
        """Print the text of every activity entry found in the feed HTML."""
        document = pq(html)
        entries = document('div[class="d-flex flex-items-baseline"] div')
        # Remove nested <span> elements so their text is left out.
        entries.find('span').remove()
        for entry in entries.items():
            print(entry.text().strip())

    def profile(self, html):
        """Print name, e-mail, site and location from the profile page."""
        document = pq(html)
        # Text inputs carry their content in the ``value`` attribute; the
        # e-mail is the text of the selected <option> of the drop-down.
        name = document('input[id="user_profile_name"]').attr('value')
        email = document(
            'select[id="user_profile_email"] option[selected="selected"]'
        ).text()
        site = document('input[id="user_profile_blog"]').attr('value')
        location = document('input[id="user_profile_location"]').attr('value')
        print(name, email, site, location)
if __name__ == "__main__":
    # getpass reads the password without echoing it on the terminal;
    # input() would leave the typed password visible on screen.
    from getpass import getpass

    login = Login()
    email = input("请输入你的github账号")
    password = getpass("请输入你的密码")
    login.login(email=email, password=password)
#输出结果:
DanLCJ forked DanLCJ/api.bilibili from dateolive/api.bilibili
tigercat123 starred dateolive/mdy-blog
DanLCJ started following you
hrdate started following you
hrdate starred dateolive/mdy-blog
zjztsinghua started following you
zjztsinghua starred dateolive/mdy-blog
hairrrrr starred dateolive/dateolive.github.io
GeophyAI forked GeophyAI/dateolive.github.io from dateolive/dateolive.github.io
梦独吟 2448282543@qq.com www.datealive.top 广东广州