Python请求和持久会话

问题:Python请求和持久会话

我正在使用请求模块(Python 2.5的版本0.10.0)。我已经找到了如何将数据提交到网站上的登录表单并检索会话密钥的方法,但是我看不到在后续请求中使用此会话密钥的明显方法。有人可以在下面的代码中填写省略号还是建议其他方法?

>>> import requests
>>> login_data =  {'formPosted':'1', 'login_email':'me@example.com', 'password':'pw'}
>>> r = requests.post('https://localhost/login.py', login_data)
>>> 
>>> r.text
u'You are being redirected <a href="profilePage?_ck=1349394964">here</a>'
>>> r.cookies
{'session_id_myapp': '127-0-0-1-825ff22a-6ed1-453b-aebc-5d3cf2987065'}
>>> 
>>> r2 = requests.get('https://localhost/profile_data.json', ...)

I am using the requests module (version 0.10.0 with Python 2.5). I have figured out how to submit data to a login form on a website and retrieve the session key, but I can’t see an obvious way to use this session key in subsequent requests. Can someone fill in the ellipsis in the code below or suggest another approach?

>>> import requests
>>> login_data =  {'formPosted':'1', 'login_email':'me@example.com', 'password':'pw'}
>>> r = requests.post('https://localhost/login.py', login_data)
>>> 
>>> r.text
u'You are being redirected <a href="profilePage?_ck=1349394964">here</a>'
>>> r.cookies
{'session_id_myapp': '127-0-0-1-825ff22a-6ed1-453b-aebc-5d3cf2987065'}
>>> 
>>> r2 = requests.get('https://localhost/profile_data.json', ...)

回答 0

您可以使用以下方法轻松创建持久会话:

s = requests.Session()

之后,继续执行您的请求,如下所示:

s.post('https://localhost/login.py', login_data)
#logged in! cookies saved for future requests.
r2 = s.get('https://localhost/profile_data.json', ...)
#cookies sent automatically!
#do whatever, s will keep your cookies intact :)

有关会话的更多信息:https : //requests.kennethreitz.org/en/master/user/advanced/#session-objects

You can easily create a persistent session using:

s = requests.Session()

After that, continue with your requests as you would:

s.post('https://localhost/login.py', login_data)
#logged in! cookies saved for future requests.
r2 = s.get('https://localhost/profile_data.json', ...)
#cookies sent automatically!
#do whatever, s will keep your cookies intact :)

For more about sessions: https://requests.kennethreitz.org/en/master/user/advanced/#session-objects


回答 1

其他答案有助于了解如何维护此类会话。另外,我想提供一个类,该类可以使会话在脚本的不同运行(带有缓存文件)上维护。这意味着仅在需要时才执行正确的“登录”(超时或缓存中不存在会话)。它还支持在随后的“ get”或“ post”调用中的代理设置。

已通过Python3测试。

使用它作为您自己的代码的基础。以下代码段随GPL v3一起发布

import pickle
import datetime
import os
from urllib.parse import urlparse
import requests    

class MyLoginSession:
    """
    a class which handles and saves login sessions. It also keeps track of proxy settings.
    It does also maintine a cache-file for restoring session data from earlier
    script executions.
    """
    def __init__(self,
                 loginUrl,
                 loginData,
                 loginTestUrl,
                 loginTestString,
                 sessionFileAppendix = '_session.dat',
                 maxSessionTimeSeconds = 30 * 60,
                 proxies = None,
                 userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
                 debug = True,
                 forceLogin = False,
                 **kwargs):
        """
        save some information needed to login the session

        you'll have to provide 'loginTestString' which will be looked for in the
        responses html to make sure, you've properly been logged in

        'proxies' is of format { 'https' : 'https://user:pass@server:port', 'http' : ...
        'loginData' will be sent as post data (dictionary of id : value).
        'maxSessionTimeSeconds' will be used to determine when to re-login.
        """
        urlData = urlparse(loginUrl)

        self.proxies = proxies
        self.loginData = loginData
        self.loginUrl = loginUrl
        self.loginTestUrl = loginTestUrl
        self.maxSessionTime = maxSessionTimeSeconds
        self.sessionFile = urlData.netloc + sessionFileAppendix
        self.userAgent = userAgent
        self.loginTestString = loginTestString
        self.debug = debug

        self.login(forceLogin, **kwargs)

    def modification_date(self, filename):
        """
        return last file modification date as datetime object
        """
        t = os.path.getmtime(filename)
        return datetime.datetime.fromtimestamp(t)

    def login(self, forceLogin = False, **kwargs):
        """
        login to a session. Try to read last saved session from cache file. If this fails
        do proper login. If the last cache access was too old, also perform a proper login.
        Always updates session cache file.
        """
        wasReadFromCache = False
        if self.debug:
            print('loading or generating session...')
        if os.path.exists(self.sessionFile) and not forceLogin:
            time = self.modification_date(self.sessionFile)         

            # only load if file less than 30 minutes old
            lastModification = (datetime.datetime.now() - time).seconds
            if lastModification < self.maxSessionTime:
                with open(self.sessionFile, "rb") as f:
                    self.session = pickle.load(f)
                    wasReadFromCache = True
                    if self.debug:
                        print("loaded session from cache (last access %ds ago) "
                              % lastModification)
        if not wasReadFromCache:
            self.session = requests.Session()
            self.session.headers.update({'user-agent' : self.userAgent})
            res = self.session.post(self.loginUrl, data = self.loginData, 
                                    proxies = self.proxies, **kwargs)

            if self.debug:
                print('created new session with login' )
            self.saveSessionToCache()

        # test login
        res = self.session.get(self.loginTestUrl)
        if res.text.lower().find(self.loginTestString.lower()) < 0:
            raise Exception("could not log into provided site '%s'"
                            " (did not find successful login string)"
                            % self.loginUrl)

    def saveSessionToCache(self):
        """
        save session to a cache file
        """
        # always save (to update timeout)
        with open(self.sessionFile, "wb") as f:
            pickle.dump(self.session, f)
            if self.debug:
                print('updated session cache-file %s' % self.sessionFile)

    def retrieveContent(self, url, method = "get", postData = None, **kwargs):
        """
        return the content of the url with respect to the session.

        If 'method' is not 'get', the url will be called with 'postData'
        as a post request.
        """
        if method == 'get':
            res = self.session.get(url , proxies = self.proxies, **kwargs)
        else:
            res = self.session.post(url , data = postData, proxies = self.proxies, **kwargs)

        # the session has been updated on the server, so also update in cache
        self.saveSessionToCache()            

        return res

使用上述类的代码片段可能如下所示:

if __name__ == "__main__":
    # proxies = {'https' : 'https://user:pass@server:port',
    #           'http' : 'http://user:pass@server:port'}

    loginData = {'user' : 'usr',
                 'password' :  'pwd'}

    loginUrl = 'https://...'
    loginTestUrl = 'https://...'
    successStr = 'Hello Tom'
    s = MyLoginSession(loginUrl, loginData, loginTestUrl, successStr, 
                       #proxies = proxies
                       )

    res = s.retrieveContent('https://....')
    print(res.text)

    # if, for instance, login via JSON values required try this:
    s = MyLoginSession(loginUrl, None, loginTestUrl, successStr, 
                       #proxies = proxies,
                       json = loginData)

the other answers help to understand how to maintain such a session. Additionally, I want to provide a class which keeps the session maintained over different runs of a script (with a cache file). This means a proper “login” is only performed when required (timout or no session exists in cache). Also it supports proxy settings over subsequent calls to ‘get’ or ‘post’.

It is tested with Python3.

Use it as a basis for your own code. The following snippets are release with GPL v3

import pickle
import datetime
import os
from urllib.parse import urlparse
import requests    

class MyLoginSession:
    """
    a class which handles and saves login sessions. It also keeps track of proxy settings.
    It does also maintine a cache-file for restoring session data from earlier
    script executions.
    """
    def __init__(self,
                 loginUrl,
                 loginData,
                 loginTestUrl,
                 loginTestString,
                 sessionFileAppendix = '_session.dat',
                 maxSessionTimeSeconds = 30 * 60,
                 proxies = None,
                 userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
                 debug = True,
                 forceLogin = False,
                 **kwargs):
        """
        save some information needed to login the session

        you'll have to provide 'loginTestString' which will be looked for in the
        responses html to make sure, you've properly been logged in

        'proxies' is of format { 'https' : 'https://user:pass@server:port', 'http' : ...
        'loginData' will be sent as post data (dictionary of id : value).
        'maxSessionTimeSeconds' will be used to determine when to re-login.
        """
        urlData = urlparse(loginUrl)

        self.proxies = proxies
        self.loginData = loginData
        self.loginUrl = loginUrl
        self.loginTestUrl = loginTestUrl
        self.maxSessionTime = maxSessionTimeSeconds
        self.sessionFile = urlData.netloc + sessionFileAppendix
        self.userAgent = userAgent
        self.loginTestString = loginTestString
        self.debug = debug

        self.login(forceLogin, **kwargs)

    def modification_date(self, filename):
        """
        return last file modification date as datetime object
        """
        t = os.path.getmtime(filename)
        return datetime.datetime.fromtimestamp(t)

    def login(self, forceLogin = False, **kwargs):
        """
        login to a session. Try to read last saved session from cache file. If this fails
        do proper login. If the last cache access was too old, also perform a proper login.
        Always updates session cache file.
        """
        wasReadFromCache = False
        if self.debug:
            print('loading or generating session...')
        if os.path.exists(self.sessionFile) and not forceLogin:
            time = self.modification_date(self.sessionFile)         

            # only load if file less than 30 minutes old
            lastModification = (datetime.datetime.now() - time).seconds
            if lastModification < self.maxSessionTime:
                with open(self.sessionFile, "rb") as f:
                    self.session = pickle.load(f)
                    wasReadFromCache = True
                    if self.debug:
                        print("loaded session from cache (last access %ds ago) "
                              % lastModification)
        if not wasReadFromCache:
            self.session = requests.Session()
            self.session.headers.update({'user-agent' : self.userAgent})
            res = self.session.post(self.loginUrl, data = self.loginData, 
                                    proxies = self.proxies, **kwargs)

            if self.debug:
                print('created new session with login' )
            self.saveSessionToCache()

        # test login
        res = self.session.get(self.loginTestUrl)
        if res.text.lower().find(self.loginTestString.lower()) < 0:
            raise Exception("could not log into provided site '%s'"
                            " (did not find successful login string)"
                            % self.loginUrl)

    def saveSessionToCache(self):
        """
        save session to a cache file
        """
        # always save (to update timeout)
        with open(self.sessionFile, "wb") as f:
            pickle.dump(self.session, f)
            if self.debug:
                print('updated session cache-file %s' % self.sessionFile)

    def retrieveContent(self, url, method = "get", postData = None, **kwargs):
        """
        return the content of the url with respect to the session.

        If 'method' is not 'get', the url will be called with 'postData'
        as a post request.
        """
        if method == 'get':
            res = self.session.get(url , proxies = self.proxies, **kwargs)
        else:
            res = self.session.post(url , data = postData, proxies = self.proxies, **kwargs)

        # the session has been updated on the server, so also update in cache
        self.saveSessionToCache()            

        return res

A code snippet for using the above class may look like this:

if __name__ == "__main__":
    # proxies = {'https' : 'https://user:pass@server:port',
    #           'http' : 'http://user:pass@server:port'}

    loginData = {'user' : 'usr',
                 'password' :  'pwd'}

    loginUrl = 'https://...'
    loginTestUrl = 'https://...'
    successStr = 'Hello Tom'
    s = MyLoginSession(loginUrl, loginData, loginTestUrl, successStr, 
                       #proxies = proxies
                       )

    res = s.retrieveContent('https://....')
    print(res.text)

    # if, for instance, login via JSON values required try this:
    s = MyLoginSession(loginUrl, None, loginTestUrl, successStr, 
                       #proxies = proxies,
                       json = loginData)

回答 2

在这个类似的问题中查看我的答案:

python:urllib2如何使用urlopen请求发送cookie

import urllib2
import urllib
from cookielib import CookieJar

cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# input-type values from the html form
formdata = { "username" : username, "password": password, "form-id" : "1234" }
data_encoded = urllib.urlencode(formdata)
response = opener.open("https://page.com/login.php", data_encoded)
content = response.read()

编辑:

我看到我的回答有些不满意,但没有解释性的评论。我猜是因为我指的是urllib库而不是requests。我这样做是因为OP寻求帮助requests或有人建议其他方法。

Check out my answer in this similar question:

python: urllib2 how to send cookie with urlopen request

import urllib2
import urllib
from cookielib import CookieJar

cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# input-type values from the html form
formdata = { "username" : username, "password": password, "form-id" : "1234" }
data_encoded = urllib.urlencode(formdata)
response = opener.open("https://page.com/login.php", data_encoded)
content = response.read()

EDIT:

I see I’ve gotten a few downvotes for my answer, but no explaining comments. I’m guessing it’s because I’m referring to the urllib libraries instead of requests. I do that because the OP asks for help with requests or for someone to suggest another approach.


回答 3

文档说这get是可选的cookies参数,允许您指定要使用的cookie:

从文档:

>>> url = 'http://httpbin.org/cookies'
>>> cookies = dict(cookies_are='working')

>>> r = requests.get(url, cookies=cookies)
>>> r.text
'{"cookies": {"cookies_are": "working"}}'

http://docs.python-requests.org/zh-CN/latest/user/quickstart/#cookies

The documentation says that get takes in an optional cookies argument allowing you to specify cookies to use:

from the docs:

>>> url = 'http://httpbin.org/cookies'
>>> cookies = dict(cookies_are='working')

>>> r = requests.get(url, cookies=cookies)
>>> r.text
'{"cookies": {"cookies_are": "working"}}'

http://docs.python-requests.org/en/latest/user/quickstart/#cookies


回答 4

尝试了以上所有答案后,我发现使用“ RequestsCookieJar”代替常规的CookieJar处理后续请求可以解决我的问题。

import requests
import json

# The Login URL
authUrl = 'https://whatever.com/login'

# The subsequent URL
testUrl = 'https://whatever.com/someEndpoint'

# Logout URL
testlogoutUrl = 'https://whatever.com/logout'

# Whatever you are posting
login_data =  {'formPosted':'1', 
               'login_email':'me@example.com', 
               'password':'pw'
               }

# The Authentication token or any other data that we will receive from the Authentication Request. 
token = ''

# Post the login Request
loginRequest = requests.post(authUrl, login_data)
print("{}".format(loginRequest.text))

# Save the request content to your variable. In this case I needed a field called token. 
token = str(json.loads(loginRequest.content)['token'])  # or ['access_token']
print("{}".format(token))

# Verify Successful login
print("{}".format(loginRequest.status_code))

# Create your Requests Cookie Jar for your subsequent requests and add the cookie
jar = requests.cookies.RequestsCookieJar()
jar.set('LWSSO_COOKIE_KEY', token)

# Execute your next request(s) with the Request Cookie Jar set
r = requests.get(testUrl, cookies=jar)
print("R.TEXT: {}".format(r.text))
print("R.STCD: {}".format(r.status_code))

# Execute your logout request(s) with the Request Cookie Jar set
r = requests.delete(testlogoutUrl, cookies=jar)
print("R.TEXT: {}".format(r.text))  # should show "Request Not Authorized"
print("R.STCD: {}".format(r.status_code))  # should show 401

Upon trying all the answers above, I found that using “RequestsCookieJar” instead of the regular CookieJar for subsequent requests fixed my problem.

import requests
import json

# The Login URL
authUrl = 'https://whatever.com/login'

# The subsequent URL
testUrl = 'https://whatever.com/someEndpoint'

# Logout URL
testlogoutUrl = 'https://whatever.com/logout'

# Whatever you are posting
login_data =  {'formPosted':'1', 
               'login_email':'me@example.com', 
               'password':'pw'
               }

# The Authentication token or any other data that we will receive from the Authentication Request. 
token = ''

# Post the login Request
loginRequest = requests.post(authUrl, login_data)
print("{}".format(loginRequest.text))

# Save the request content to your variable. In this case I needed a field called token. 
token = str(json.loads(loginRequest.content)['token'])  # or ['access_token']
print("{}".format(token))

# Verify Successful login
print("{}".format(loginRequest.status_code))

# Create your Requests Cookie Jar for your subsequent requests and add the cookie
jar = requests.cookies.RequestsCookieJar()
jar.set('LWSSO_COOKIE_KEY', token)

# Execute your next request(s) with the Request Cookie Jar set
r = requests.get(testUrl, cookies=jar)
print("R.TEXT: {}".format(r.text))
print("R.STCD: {}".format(r.status_code))

# Execute your logout request(s) with the Request Cookie Jar set
r = requests.delete(testlogoutUrl, cookies=jar)
print("R.TEXT: {}".format(r.text))  # should show "Request Not Authorized"
print("R.STCD: {}".format(r.status_code))  # should show 401

回答 5

片段以获取受密码保护的json数据

import requests

username = "my_user_name"
password = "my_super_secret"
url = "https://www.my_base_url.com"
the_page_i_want = "/my_json_data_page"

session = requests.Session()
# retrieve cookie value
resp = session.get(url+'/login')
csrf_token = resp.cookies['csrftoken']
# login, add referer
resp = session.post(url+"/login",
                  data={
                      'username': username,
                      'password': password,
                      'csrfmiddlewaretoken': csrf_token,
                      'next': the_page_i_want,
                  },
                  headers=dict(Referer=url+"/login"))
print(resp.json())

snippet to retrieve json data, password protected

import requests

username = "my_user_name"
password = "my_super_secret"
url = "https://www.my_base_url.com"
the_page_i_want = "/my_json_data_page"

session = requests.Session()
# retrieve cookie value
resp = session.get(url+'/login')
csrf_token = resp.cookies['csrftoken']
# login, add referer
resp = session.post(url+"/login",
                  data={
                      'username': username,
                      'password': password,
                      'csrfmiddlewaretoken': csrf_token,
                      'next': the_page_i_want,
                  },
                  headers=dict(Referer=url+"/login"))
print(resp.json())

回答 6

仅保存所需的cookie并重复使用。

import os
import pickle
from urllib.parse import urljoin, urlparse

login = 'my@email.com'
password = 'secret'
# Assuming two cookies are used for persistent login.
# (Find it by tracing the login process)
persistentCookieNames = ['sessionId', 'profileId']
URL = 'http://example.com'
urlData = urlparse(URL)
cookieFile = urlData.netloc + '.cookie'
signinUrl = urljoin(URL, "/signin")
with requests.Session() as session:
    try:
        with open(cookieFile, 'rb') as f:
            print("Loading cookies...")
            session.cookies.update(pickle.load(f))
    except Exception:
        # If could not load cookies from file, get the new ones by login in
        print("Login in...")
        post = session.post(
            signinUrl,
            data={
                'email': login,
                'password': password,
            }
        )
        try:
            with open(cookieFile, 'wb') as f:
                jar = requests.cookies.RequestsCookieJar()
                for cookie in session.cookies:
                    if cookie.name in persistentCookieNames:
                        jar.set_cookie(cookie)
                pickle.dump(jar, f)
        except Exception as e:
            os.remove(cookieFile)
            raise(e)
    MyPage = urljoin(URL, "/mypage")
    page = session.get(MyPage)

Save only required cookies and reuse them.

import os
import pickle
from urllib.parse import urljoin, urlparse

login = 'my@email.com'
password = 'secret'
# Assuming two cookies are used for persistent login.
# (Find it by tracing the login process)
persistentCookieNames = ['sessionId', 'profileId']
URL = 'http://example.com'
urlData = urlparse(URL)
cookieFile = urlData.netloc + '.cookie'
signinUrl = urljoin(URL, "/signin")
with requests.Session() as session:
    try:
        with open(cookieFile, 'rb') as f:
            print("Loading cookies...")
            session.cookies.update(pickle.load(f))
    except Exception:
        # If could not load cookies from file, get the new ones by login in
        print("Login in...")
        post = session.post(
            signinUrl,
            data={
                'email': login,
                'password': password,
            }
        )
        try:
            with open(cookieFile, 'wb') as f:
                jar = requests.cookies.RequestsCookieJar()
                for cookie in session.cookies:
                    if cookie.name in persistentCookieNames:
                        jar.set_cookie(cookie)
                pickle.dump(jar, f)
        except Exception as e:
            os.remove(cookieFile)
            raise(e)
    MyPage = urljoin(URL, "/mypage")
    page = session.get(MyPage)

回答 7

这将在Python中为您工作;

# Call JIRA API with HTTPBasicAuth
import json
import requests
from requests.auth import HTTPBasicAuth

JIRA_EMAIL = "****"
JIRA_TOKEN = "****"
BASE_URL = "https://****.atlassian.net"
API_URL = "/rest/api/3/serverInfo"

API_URL = BASE_URL+API_URL

BASIC_AUTH = HTTPBasicAuth(JIRA_EMAIL, JIRA_TOKEN)
HEADERS = {'Content-Type' : 'application/json;charset=iso-8859-1'}

response = requests.get(
    API_URL,
    headers=HEADERS,
    auth=BASIC_AUTH
)

print(json.dumps(json.loads(response.text), sort_keys=True, indent=4, separators=(",", ": ")))

This will work for you in Python;

# Call JIRA API with HTTPBasicAuth
import json
import requests
from requests.auth import HTTPBasicAuth

JIRA_EMAIL = "****"
JIRA_TOKEN = "****"
BASE_URL = "https://****.atlassian.net"
API_URL = "/rest/api/3/serverInfo"

API_URL = BASE_URL+API_URL

BASIC_AUTH = HTTPBasicAuth(JIRA_EMAIL, JIRA_TOKEN)
HEADERS = {'Content-Type' : 'application/json;charset=iso-8859-1'}

response = requests.get(
    API_URL,
    headers=HEADERS,
    auth=BASIC_AUTH
)

print(json.dumps(json.loads(response.text), sort_keys=True, indent=4, separators=(",", ": ")))