我根据您的需求修改了一个网络抓取模板,该模板可满足大多数基于 Python 的抓取需求。我已使用自己的登录信息验证它可以正常工作。
它的工作方式是通过模仿浏览器并维护一个cookieJar来存储您的用户会话。也可以与BeautifulSoup一起使用。
注意: 这是 Python 2 版本。应要求,我在下面补充了一个可用的 Python 3 示例。
import cookielibimport osimport urllibimport urllib2import reimport stringfrom BeautifulSoup import BeautifulSoupusername = "user@email.com"password = "password"cookie_filename = "parser.cookies.txt"class linkedInParser(object): def __init__(self, login, password): """ Start up... """ self.login = login self.password = password # Simulate browser with cookies enabled self.cj = cookielib.MozillacookieJar(cookie_filename) if os.access(cookie_filename, os.F_OK): self.cj.load() self.opener = urllib2.build_opener( urllib2.HTTPRedirectHandler(), urllib2.HTTPHandler(debuglevel=0), urllib2.HTTPSHandler(debuglevel=0), urllib2.HTTPcookieProcessor(self.cj) ) self.opener.addheaders = [ ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; ' 'Windows NT 5.2; .NET CLR 1.1.4322)')) ] # Login self.loginPage() title = self.loadTitle() print title self.cj.save() def loadPage(self, url, data=None): """ Utility function to load HTML from URLs for us with hack to continue despite 404 """ # We'll print the url in case of infinite loop # print "Loading URL: %s" % url try: if data is not None: response = self.opener.open(url, data) else: response = self.opener.open(url) return ''.join(response.readlines()) except: # If URL doesn't load for ANY reason, try again... # Quick and dirty solution for 404 returns because of network problems # However, this could infinite loop if there's an actual problem return self.loadPage(url, data) def loginPage(self): """ Handle login. This should populate our cookie jar. 
""" html = self.loadPage("https://www.linkedin.com/") soup = BeautifulSoup(html) csrf = soup.find(id="loginCsrfParam-login")['value'] login_data = urllib.urlenpre({ 'session_key': self.login, 'session_password': self.password, 'loginCsrfParam': csrf, }) html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data) return def loadTitle(self): html = self.loadPage("https://www.linkedin.com/feed/") soup = BeautifulSoup(html) return soup.find("title")parser = linkedInParser(username, password)2014年6月19日更新: 从首页添加了对CSRF令牌的解析,以用于更新的登录过程。
2015年7月23日更新: 在此处添加 Python 3 示例。基本上只需替换库的导入位置并移除已弃用的方法。它的格式并不完美,写得也不够优雅,但可以正常工作——抱歉,这是赶工完成的。总之,原理和步骤是相同的。
import http.cookiejar as cookielib
import os
import re
import string
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

username = "user@email.com"
password = "password"
cookie_filename = "parser.cookies.txt"


class linkedInParser(object):
    """Log in to LinkedIn by mimicking a browser (Python 3 version).

    Same flow as the Python 2 version: a Mozilla-format cookie jar keeps
    the session and is persisted to ``cookie_filename``.
    """

    def __init__(self, login, password):
        """Start up: build a cookie-aware opener, log in, print the feed title."""
        self.login = login
        self.password = password
        # Simulate browser with cookies enabled.
        # Fixed: the class is MozillaCookieJar (capital C);
        # "MozillacookieJar" raises AttributeError.
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        # Fixed: `import urllib` alone does not import the request/parse
        # submodules; they are imported explicitly at the top.
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            # Fixed: HTTPCookieProcessor (capital C).
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]
        # Login
        self.loginPage()
        title = self.loadTitle()
        print(title)
        self.cj.save()

    def loadPage(self, url, data=None, retries=10):
        """Load HTML from *url*, retrying on transient failures.

        ``retries`` bounds the retry loop: the original version recursed
        unconditionally on ANY exception, which loops forever on a
        persistent error (e.g. a real 404 or DNS failure).
        """
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            # Decode the response bytes once. The original joined
            # str(line) over bytes lines, which embeds literal "b'...'"
            # markers in the returned HTML.
            return response.read().decode('utf-8', errors='replace')
        except Exception:
            if retries <= 0:
                raise
            # Quick and dirty workaround for intermittent network errors.
            return self.loadPage(url, data, retries - 1)

    def loadSoup(self, url, data=None):
        """Combine loading of URL, HTML, and parsing with BeautifulSoup."""
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """Handle login. This should populate our cookie jar."""
        soup = self.loadSoup("https://www.linkedin.com/")
        # The CSRF token from the home page is required by the newer
        # login flow.
        csrf = soup.find(id="loginCsrfParam-login")['value']
        # Fixed: urllib.parse.urlencode / .encode (there is no "urlenpre"
        # or "enpre"). The form body must be bytes before POSTing.
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        }).encode('utf8')
        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        """Return the <title> tag of the feed page (confirms the login worked)."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")


parser = linkedInParser(username, password)


