我根据您的需求修改了一个网络抓取模板,该模板可满足大多数基于 Python 的抓取需求。我已使用自己的登录信息验证它可以正常工作。
它的工作方式是通过模仿浏览器并维护一个cookieJar来存储您的用户会话。也可以与BeautifulSoup一起使用。
注意: 这是 Python 2 版本。应要求,我在下面补充了一个可用的 Python 3 示例。
import cookielibimport osimport urllibimport urllib2import reimport stringfrom BeautifulSoup import BeautifulSoupusername = "user@email.com"password = "password"cookie_filename = "parser.cookies.txt"class linkedInParser(object): def __init__(self, login, password): """ Start up... """ self.login = login self.password = password # Simulate browser with cookies enabled self.cj = cookielib.MozillacookieJar(cookie_filename) if os.access(cookie_filename, os.F_OK): self.cj.load() self.opener = urllib2.build_opener( urllib2.HTTPRedirectHandler(), urllib2.HTTPHandler(debuglevel=0), urllib2.HTTPSHandler(debuglevel=0), urllib2.HTTPcookieProcessor(self.cj) ) self.opener.addheaders = [ ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; ' 'Windows NT 5.2; .NET CLR 1.1.4322)')) ] # Login self.loginPage() title = self.loadTitle() print title self.cj.save() def loadPage(self, url, data=None): """ Utility function to load HTML from URLs for us with hack to continue despite 404 """ # We'll print the url in case of infinite loop # print "Loading URL: %s" % url try: if data is not None: response = self.opener.open(url, data) else: response = self.opener.open(url) return ''.join(response.readlines()) except: # If URL doesn't load for ANY reason, try again... # Quick and dirty solution for 404 returns because of network problems # However, this could infinite loop if there's an actual problem return self.loadPage(url, data) def loginPage(self): """ Handle login. This should populate our cookie jar. 
""" html = self.loadPage("https://www.linkedin.com/") soup = BeautifulSoup(html) csrf = soup.find(id="loginCsrfParam-login")['value'] login_data = urllib.urlenpre({ 'session_key': self.login, 'session_password': self.password, 'loginCsrfParam': csrf, }) html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data) return def loadTitle(self): html = self.loadPage("https://www.linkedin.com/feed/") soup = BeautifulSoup(html) return soup.find("title")parser = linkedInParser(username, password)2014年6月19日更新: 从首页添加了对CSRF令牌的解析,以用于更新的登录过程。
2015年7月23日更新: 在此处添加 Python 3 示例。基本上只需替换库的导入位置并移除已弃用的方法。它的格式并不完美,写得也不够优雅,但可以正常工作——抱歉,这是赶工完成的。总之,原理和步骤是相同的。
import http.cookiejar as cookielib
import os
import re
import string
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

username = "user@email.com"
password = "password"
cookie_filename = "parser.cookies.txt"


class linkedInParser(object):
    """Log in to LinkedIn by mimicking a browser (Python 3 version).

    Same flow as the Python 2 version: a Mozilla-format cookie jar keeps
    the session and is persisted to ``cookie_filename``.
    """

    def __init__(self, login, password):
        """Start up: build a cookie-aware opener, log in, print the feed title."""
        self.login = login
        self.password = password
        # Simulate browser with cookies enabled.
        # Fixed: the class is MozillaCookieJar (capital C);
        # "MozillacookieJar" raises AttributeError.
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        # Fixed: `import urllib` alone does not import the request/parse
        # submodules; they are imported explicitly at the top.
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            # Fixed: HTTPCookieProcessor (capital C).
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]
        # Login
        self.loginPage()
        title = self.loadTitle()
        print(title)
        self.cj.save()

    def loadPage(self, url, data=None, retries=10):
        """Load HTML from *url*, retrying on transient failures.

        ``retries`` bounds the retry loop: the original version recursed
        unconditionally on ANY exception, which loops forever on a
        persistent error (e.g. a real 404 or DNS failure).
        """
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            # Decode the response bytes once. The original joined
            # str(line) over bytes lines, which embeds literal "b'...'"
            # markers in the returned HTML.
            return response.read().decode('utf-8', errors='replace')
        except Exception:
            if retries <= 0:
                raise
            # Quick and dirty workaround for intermittent network errors.
            return self.loadPage(url, data, retries - 1)

    def loadSoup(self, url, data=None):
        """Combine loading of URL, HTML, and parsing with BeautifulSoup."""
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """Handle login. This should populate our cookie jar."""
        soup = self.loadSoup("https://www.linkedin.com/")
        # The CSRF token from the home page is required by the newer
        # login flow.
        csrf = soup.find(id="loginCsrfParam-login")['value']
        # Fixed: urllib.parse.urlencode / .encode (there is no "urlenpre"
        # or "enpre"). The form body must be bytes before POSTing.
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        }).encode('utf8')
        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        """Return the <title> tag of the feed page (confirms the login worked)."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")


parser = linkedInParser(username, password)


