栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python下载pda论坛 收藏和发出的帖子

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python下载pda论坛 收藏和发出的帖子

# coding=utf-8
from requests_html import HTMLSession
from tqdm import tqdm
import os
import time
import re
import sys
import hashlib

# Make sure the directory that receives the downloaded thread pages exists.
if not os.path.exists('./hparchive'):
    os.makedirs('./hparchive')

# Maps thread id (tid) -> thread title; filled in by getlist().
tidlist = {}

# Browser-like User-Agent sent with every request of the shared session.
useragent = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/12.1 Safari/605.1.15'
    ),
}

# NOTE(review): every request is routed through a proxy on localhost:1080;
# presumably this matches the author's local setup - confirm before reuse.
proxies = {'http': 'http://localhost:1080', 'https': 'http://localhost:1080'}

# One session shared by all functions in this module.
hpsession = HTMLSession()
hpsession.headers.update(useragent)
hpsession.proxies = proxies


def userlogin():
    """Interactively log the shared ``hpsession`` in to the forum.

    Prompts for user name, password and (optionally) a security question
    and its answer, then posts them to the forum login endpoint.

    The function returns ``None`` when the response contains none of the
    known failure messages. On a recognised failure it prints a message
    and terminates the process via ``sys.exit(1)``.

    Raises:
        SystemExit: when the server reports too many failed attempts, a
            wrong password, or a missing/incorrect security answer.
    """
    questionids = {
        '0': '无安全提问',
        '1': '母亲的名字',
        '2': '爷爷的名字',
        '3': '父亲出生的城市',
        '4': '您其中一位老师的名字',
        '5': '您个人计算机的型号',
        '6': '您最喜欢的餐馆名称',
        '7': '驾驶执照的最后四位数字',
    }
    usernameinput = input('用户名: ')
    pwdinput = input('密码: ')
    # Form fields are submitted as GBK bytes (presumably because the forum
    # itself is GBK-encoded - TODO confirm).
    data = {
        'loginfield': 'username',
        'username': usernameinput.encode('gbk'),
        'password': pwdinput.encode('gbk'),
    }

    print('按数字选择安全提问,没有就直接回车或者选择0')
    for number, text in questionids.items():
        print(number, ':  ', text)

    # Keep asking until the user presses Enter (keeps the default '0')
    # or types one of the listed numbers.
    questionid = '0'
    while True:
        questionidinput = input('按数字选择安全提问:')
        if questionidinput == '':
            break
        if questionidinput in questionids:
            questionid = questionidinput
            break

    print('安全提问: ', questionids[questionid])
    if questionid != '0':
        answerinput = input('输入安全提问答案:')
        data['questionid'] = questionid
        data['answer'] = answerinput.encode('gbk')

    loginurl = 'https://www.hi-pda.com/forum/logging.php?action=login&loginsubmit=yes&inajax=1'
    result = hpsession.post(loginurl, data=data)
    # The request body is deliberately NOT printed: it contains the password.
    # gb18030 is a superset of GBK; errors='replace' keeps an unexpected
    # byte sequence from crashing the login with UnicodeDecodeError.
    resultgbk = result.content.decode('gb18030', errors='replace')

    # Each recognised failure exits with a non-zero status so that callers
    # and shell scripts can tell the login did not succeed.
    if '密码错误次数过多' in resultgbk:
        print('错误次数过多,请15分钟后再试,建议使用浏览器登录查看详情')
        sys.exit(1)

    if '可以尝试' in resultgbk:
        # Show the server's own message to the user as-is.
        print(resultgbk)
        sys.exit(1)

    if '请填写安全提问以及正确的答案' in resultgbk or '选择错误' in resultgbk:
        print('没有填写安全提问或者答案不正确')
        sys.exit(1)


def getlist(page=1, listtype='-fav'):
    """Collect thread ids and titles from the favourites / own-threads list.

    Starting at ``page``, fetches each list page with the shared
    ``hpsession`` and stores ``tid -> title`` in the module-level
    ``tidlist`` dict, following the "next page" link until none is found.

    Args:
        page: 1-based number of the first list page to fetch.
        listtype: ``'-mypost'`` reads the user's own threads; any other
            value (default ``'-fav'``) reads the favourites list.

    Returns:
        None. Results are accumulated in ``tidlist``.
    """
    baseurl = 'https://www.hi-pda.com/forum/my.php?item=favorites&type=thread'
    tbodysel = '#wrap > div.main > div > div.threadlist.datalist > form > table > tbody'
    if listtype == '-mypost':
        baseurl = 'https://www.hi-pda.com/forum/my.php?item=threads'
        tbodysel = 'tbody'

    # Iterate instead of recursing so a long list cannot hit the recursion
    # limit.
    while True:
        listurl = baseurl if page == 1 else baseurl + '&page=' + str(page)
        listpage = hpsession.get(listurl)

        # find(first=True) yields None when the table is absent (e.g. an
        # empty list); treat that as "no threads on this page".
        tbody = listpage.html.find(tbodysel, first=True)
        links = tbody.find('tr > th > a') if tbody is not None else []

        for a in links:
            href = a.attrs.get('href', '')
            if 'tid=' not in href:
                # Not a thread link; nothing to extract.
                continue
            tid = href.split('tid=')[1].split('&')[0]
            tidlist[tid] = a.text

        hasnextpage = len(listpage.html.find('a.next')) > 0
        # Short pause between requests.
        time.sleep(0.1)
        if not hasnextpage:
            break
        page += 1


def genTOC(listtype='-fav'):
    """Write an HTML table of contents for the collected threads.

    One link per entry of the module-level ``tidlist`` is written, each
    pointing at the first saved page of that thread
    (``./hparchive/<tid>-1.html``, the path used by ``savethread``).

    Args:
        listtype: ``'-mypost'`` writes ``mypost.html``; any other value
            (default ``'-fav'``) writes ``fav.html``.
    """
    from html import escape  # local import: only needed here

    filename = 'fav.html'
    if listtype == '-mypost':
        filename = 'mypost.html'

    # NOTE(review): the anchor markup was lost when this code was published;
    # the href prefix and closing tags are reconstructed from savethread's
    # output path - confirm against the upstream repository.
    # Titles are escaped so characters such as '<' or '&' cannot break the
    # generated page.
    tocs = ''.join(
        '<a href="./hparchive/' + str(tid) + '-1.html" target="_blank">'
        + escape(title) + '</a><br>' + '\n'
        for tid, title in tidlist.items()
    )
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(tocs)


def _localize_page_links(page_html, tid):
    """Rewrite a thread's page-number links to the locally saved files."""
    # Group 1 captures the page number the link points to. '&' may appear
    # either raw or as the '&amp;' entity in the page source.
    pattern = (
        r'viewthread\.php\?tid=' + re.escape(str(tid))
        + r'&(?:amp;)?extra=[^&"\']*&(?:amp;)?page=(\d+)'
    )
    # Becomes '<tid>-<page>.html', i.e. the name savethread() saves under.
    return re.sub(pattern, str(tid) + r'-\g<1>.html', page_html)


def savethread(tid, page=1, pagetype='norm'):
    """Download the HTML of thread ``tid`` into ``./hparchive/``.

    In normal mode, pages are fetched one after another, starting at
    ``page``, until a page no longer contains a "next page" button. Each
    page is saved as ``<tid>-<page>.html`` with its page links rewritten to
    those local files. With ``pagetype='--print'`` only the printable
    version is fetched and saved once under the given ``page`` number.

    Args:
        tid: thread id (as found in ``tidlist``).
        page: 1-based number of the first page to download.
        pagetype: ``'--print'`` for the printable version, anything else
            for the regular paged view.
    """
    rawurl = 'https://www.hi-pda.com/forum/viewthread.php?tid='

    if pagetype == '--print':
        printableurl = rawurl + str(tid) + '&action=printable'
        printr = hpsession.get(printableurl)
        with open('./hparchive/' + str(tid) + '-' + str(page) + '.html',
                  'w', encoding='gb18030') as f:
            f.write(printr.html.html)
        return

    # Iterate over the pages instead of recursing once per page.
    while True:
        if page == 1:
            threadurl = rawurl + str(tid)
        else:
            threadurl = rawurl + str(tid) + '&extra=&page=' + str(page)
        r = hpsession.get(threadurl)
        modhtml = _localize_page_links(r.html.html, tid)

        if r.status_code != 200:
            # Report the affected thread; the response is still saved.
            print(tidlist.get(tid, tid))

        # Writing as utf-8 garbles the page and plain gbk fails on some
        # special characters, so the file is written as gb18030.
        with open('./hparchive/' + str(tid) + '-' + str(page) + '.html',
                  'w', encoding='gb18030') as f:
            f.write(modhtml)

        hasnextpage = len(r.html.find('a.next')) > 0
        # Short pause between requests.
        time.sleep(0.1)
        if not hasnextpage:
            break
        page += 1


def work():
    """Command-line entry point: parse arguments, log in and archive.

    Accepted arguments: an optional list type (``-fav`` or ``-mypost``)
    optionally followed by ``--print``. Invalid arguments print a usage
    message and exit with status 1 before any network access happens.
    """
    listtype = '-fav'
    pagetype = 'norm'
    helpmsg = '参数错误,-fav收藏帖,-mypost我的发帖,--print加载打印版网页(只有前两页内容,但是速度较快)'
    argcount = len(sys.argv)

    if argcount > 3:
        print('参数错误,正确用法:python hparchive (-fav,-mypost,--print)')
        sys.exit(1)
    elif argcount == 3:
        listtype = sys.argv[1]
        pagetype = sys.argv[2]
        if listtype not in ['-fav', '-mypost'] or pagetype != '--print':
            print(helpmsg)
            sys.exit(1)
    elif argcount == 2:
        listtype = sys.argv[1]
        if listtype not in ['-fav', '-mypost']:
            print(helpmsg)
            sys.exit(1)

    userlogin()
    getlist(page=1, listtype=listtype)
    print('一共' + str(len(tidlist)) + '个贴子')
    genTOC(listtype=listtype)

    # The progress bar misbehaves on Windows, so fall back to ASCII there.
    is_windows = (os.name == 'nt')
    for tid in tqdm(tidlist, ascii=is_windows):
        savethread(tid, page=1, pagetype=pagetype)
        time.sleep(0.3)


if __name__ == "__main__":
    work()

参考
https://github.com/fqxufo/hparchive

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/870566.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号