用Python下载Hi-PDA论坛收藏和发出的帖子

# coding=utf-8
from requests_html import HTMLSession
from tqdm import tqdm
import os
import time
import re
import sys
import hashlib

# Output directory for every downloaded thread page.  exist_ok=True replaces
# the separate os.path.exists() test, which could race with another process
# creating the directory between the check and the makedirs() call.
os.makedirs('./hparchive', exist_ok=True)

# Maps thread id (str) -> thread title; filled in by getlist().
tidlist = {}
useragent = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15'}
# A single session shared by all requests in this script.
hpsession = HTMLSession()
hpsession.headers.update(useragent)
# NOTE(review): every request is routed through an HTTP proxy on
# localhost:1080; presumably requests fail when nothing listens there --
# confirm this is intended before running on another machine.
hpsession.proxies = proxies = {'http': 'http://localhost:1080', 'https': 'http://localhost:1080'}


def userlogin():
    """Prompt for forum credentials and log the shared session in.

    Reads the user name, password and an optional security question/answer
    from the terminal, posts them (GBK-encoded) to the forum login endpoint
    through ``hpsession`` and scans the response for known failure messages.

    Raises:
        SystemExit: with status 1 when the response contains one of the
            recognised failure messages.
    """
    import getpass  # local import: only this function needs it

    questionids = {
        '0': '无安全提问',
        '1': '母亲的名字',
        '2': '爷爷的名字',
        '3': '父亲出生的城市',
        '4': '您其中一位老师的名字',
        '5': '您个人计算机的型号',
        '6': '您最喜欢的餐馆名称',
        '7': '驾驶执照的最后四位数字',
    }
    usernameinput = input('用户名： ')
    # getpass keeps the password from being echoed to the terminal.
    pwdinput = getpass.getpass('密码: ')
    data = {
        'loginfield': 'username',
        'username': usernameinput.encode('gbk'),
        'password': pwdinput.encode('gbk'),
    }

    print('按数字选择安全提问,没有就直接回车或者选择0')
    for question, text in questionids.items():
        print(question, ':  ', text)

    # Keep asking until the input is empty (meaning "0") or a known id.
    questionid = '0'
    while True:
        questionidinput = input('按数字选择安全提问:')
        if questionidinput == '':
            break
        if questionidinput in questionids:
            questionid = questionidinput
            break

    print('安全提问: ', questionids[questionid])
    if questionid != '0':
        answerinput = input('输入安全提问答案:')
        data['questionid'] = questionid
        data['answer'] = answerinput.encode('gbk')

    loginurl = 'https://www.hi-pda.com/forum/logging.php?action=login&loginsubmit=yes&inajax=1'
    result = hpsession.post(loginurl, data=data)
    # The request body is deliberately NOT printed: it contains the password.
    print(result.text)
    # errors='replace' so a stray byte in the response cannot abort the login
    # with a UnicodeDecodeError before the failure checks below run.
    resultgbk = result.content.decode('gbk', errors='replace')

    # Every branch below is a login failure, so exit with a non-zero status.
    if '密码错误次数过多' in resultgbk:
        print('错误次数过多,请15分钟后再试,建议使用浏览器登录查看详情')
        sys.exit(1)

    if '可以尝试' in resultgbk:
        print(resultgbk)
        sys.exit(1)

    if '请填写安全提问以及正确的答案' in resultgbk or '选择错误' in resultgbk:
        print('没有填写安全提问或者答案不正确')
        sys.exit(1)


def getlist(page=1, listtype='-fav'):
    """Collect thread ids and titles from the favorites / my-threads listing.

    Starting at ``page``, fetches each listing page through ``hpsession`` and
    stores ``tid -> title`` in the module-level ``tidlist``.  It continues
    with the following page for as long as the current one contains an
    ``a.next`` link.

    Args:
        page: 1-based listing page to start from.
        listtype: ``'-mypost'`` selects the "my threads" listing; any other
            value selects the favorites listing.
    """
    baseurl = 'https://www.hi-pda.com/forum/my.php?item=favorites&type=thread'
    tbodysel = '#wrap > div.main > div > div.threadlist.datalist > form > table > tbody'
    if listtype == '-mypost':
        baseurl = 'https://www.hi-pda.com/forum/my.php?item=threads'
        tbodysel = 'tbody'

    # Iterate instead of recursing once per page, so the number of listing
    # pages is not limited by the interpreter's recursion depth.
    while True:
        listurl = baseurl if page == 1 else baseurl + '&page=' + str(page)
        listpage = hpsession.get(listurl)

        tbody = listpage.html.find(tbodysel, first=True)
        if tbody is None:
            # The selector matched nothing (for example the response is not a
            # listing page); report it instead of failing on None.find().
            print('未找到帖子列表: ' + listurl)
            break

        for a in tbody.find('tr > th > a'):
            href = a.attrs.get('href', '')
            if 'tid=' not in href:
                # Not a thread link; there is no id to extract.
                continue
            tid = href.split('tid=')[1].split('&')[0]
            tidlist[tid] = a.text

        hasnextpage = bool(listpage.html.find('a.next'))
        time.sleep(0.1)  # short pause between consecutive page requests
        if not hasnextpage:
            break
        page += 1


def genTOC(listtype='-fav', threads=None):
    """Write an HTML table-of-contents file linking to the saved threads.

    One line is written per thread: a link to the first saved page of that
    thread (``./hparchive/<tid>-1.html``) labelled with the thread title.

    Args:
        listtype: ``'-mypost'`` writes ``mypost.html``; any other value
            writes ``fav.html``.
        threads: mapping of thread id -> title.  Defaults to the
            module-level ``tidlist``.
    """
    from html import escape  # local import: only this function needs it

    if threads is None:
        threads = tidlist

    filename = 'mypost.html' if listtype == '-mypost' else 'fav.html'

    # Titles are plain text taken from the forum, so they are escaped before
    # being embedded in markup; otherwise '<' or '&' would break the page.
    lines = [
        '<a href="./hparchive/' + str(tid) + '-1.html" target="_blank">'
        + escape(str(title)) + '</a><br>' + '\n'
        for tid, title in threads.items()
    ]

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(''.join(lines))


def savethread(tid, page=1, pagetype='norm'):
    """Download the HTML of thread ``tid`` into ``./hparchive/``.

    In normal mode every page from ``page`` onwards is fetched and saved as
    ``<tid>-<page>.html`` until a page no longer contains an ``a.next`` link.
    Links to other pages of the same thread are rewritten to point at the
    local ``<tid>-<n>.html`` files.  In ``'--print'`` mode only the printable
    view is fetched and saved once.

    Args:
        tid: thread id (converted to ``str``).
        page: 1-based page number to start from.
        pagetype: ``'--print'`` for the printable view; anything else for the
            normal paged view.
    """
    tid = str(tid)
    rawurl = 'https://www.hi-pda.com/forum/viewthread.php?tid='

    if pagetype == '--print':
        printableurl = rawurl + tid + '&action=printable'
        printr = hpsession.get(printableurl)
        with open('./hparchive/' + tid + '-' + str(page) + '.html', 'w', encoding='gb18030') as f:
            f.write(printr.html.html)
        return

    # Matches links to other pages of this thread; group 1 is the target page
    # number.  '.' and '?' are escaped so they match literally, and '&' is
    # accepted both raw and HTML-escaped as '&amp;'.
    pagelink = re.compile(
        r'viewthread\.php\?tid=' + re.escape(tid)
        + r'&(?:amp;)?extra=&(?:amp;)?page=(\d+)')

    def _localname(match):
        # Name of the local file that holds the linked page.
        return tid + '-' + match.group(1) + '.html'

    # Loop instead of recursing once per page, so long threads cannot hit the
    # interpreter's recursion limit.
    while True:
        if page == 1:
            threadurl = rawurl + tid
        else:
            threadurl = rawurl + tid + '&extra=&page=' + str(page)

        r = hpsession.get(threadurl)
        modhtml = pagelink.sub(_localname, r.html.html)
        if r.status_code != 200:
            # Report the affected thread; fall back to the id when the title
            # is not known.
            print(tidlist.get(tid, tid))

        # The page is written as gb18030 rather than utf-8 or gbk: utf-8
        # output showed up garbled and gbk fails on some special characters.
        with open('./hparchive/' + tid + '-' + str(page) + '.html', 'w', encoding='gb18030') as f:
            f.write(modhtml)

        hasnextpage = bool(r.html.find('a.next'))
        time.sleep(0.1)  # short pause between consecutive page requests
        if not hasnextpage:
            break
        page += 1


def work():
    """Parse the command line, log in, and archive the selected threads.

    Accepted invocations: no argument (favorites), a list type (``-fav`` or
    ``-mypost``), or a list type followed by ``--print``.  Any other
    combination prints a usage message and exits with status 1.
    """
    helpmsg = '参数错误,-fav收藏帖,-mypost我的发帖,--print加载打印版网页(只有前两页内容，但是速度较快)'
    known_lists = ['-fav', '-mypost']
    listtype = '-fav'
    pagetype = 'norm'

    argc = len(sys.argv)
    if argc > 3:
        print('参数错误,正确用法:python hparchive (-fav,-mypost,--print)')
        sys.exit(1)

    if argc in (2, 3):
        listtype = sys.argv[1]
        invalid = listtype not in known_lists
        if argc == 3:
            # A second argument, when present, may only be --print.
            pagetype = sys.argv[2]
            invalid = invalid or pagetype != '--print'
        if invalid:
            print(helpmsg)
            sys.exit(1)

    userlogin()
    getlist(page=1, listtype=listtype)
    print('一共' + str(len(tidlist)) + '个贴子')
    genTOC(listtype=listtype)

    # The progress bar misbehaves on Windows, so fall back to ASCII there.
    use_ascii = os.name == 'nt'
    for tid in tqdm(tidlist, ascii=use_ascii):
        savethread(tid, page=1, pagetype=pagetype)
        time.sleep(0.3)


# Run the archiver only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    work()
参考
https://github.com/fqxufo/hparchive
python下载pda论坛收藏和发出的帖子

Python相关栏目本月热门文章

python下载pda论坛收藏和发出的帖子