爬取Leetcode的每日一题（Java/Python）

抓取Leetcode的每日一题信息思路一（发送GraphQL Query获取数据）

参考文章：https://www.cnblogs.com/ZhaoxiCheung/p/9333476.html

接口分析

主要的数据存在于graphql/接口中：

https://leetcode-cn.com/graphql/

首页热门题目接口

是否AC状态查看接口

每日一题接口

构造 GraphQL Query来获取信息

在Headers下的Request Payload中我们可以看到一个query字段，这是我们要构造的 GraphQL Query 的一个重要信息。

利用Postman来分析接口

我们并不一开始就用代码来获取题目信息，而是先利用 Postman 来看看如何获取题目信息。右键 Network 下的 graphql 文件—>Copy—>Copy as cURL(bash)

接着我们打开Postman，点击左上角File里的import，然后找到Raw text栏

将copy下来的cURL粘贴到Raw text中，点击continue，就可以在Postman中查看

在这之前遇到了一个小问题，把copy all as cURL看成了copy as cURL，导致在Postman中解析错误。

curl解析的结果如下：

从解析的结果看，和我们在Headers中看到的query字段类似，不过有一些细节需要更改。

当然，如果不想直接粘贴复制的 cURL，那么我们可以自己在 Postman 中写 Header 和 Body，需要注意的是这边的 Content-Type是application/graphql，Body 中的 GraphQL 构造，参照 Request Payload 中的query的字段来构造

利用Java的Jsoup和okhttp库来发送HTTP请求，并用fastjson解析返回的JSON数据

package com.example.leetcode_card.utils;
import com.alibaba.fastjson.JSONObject;
import okhttp3.*;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import java.io.IOException;
import java.util.Map;
import java.util.Objects;

public class GraphqlUtil {

    private static String base_URL = "https://leetcode-cn.com";
    private static String questionUrl = "https://leetcode-cn.com/problems/two-sum/description/";
    private static String GRAPHQL_URL = "https://leetcode-cn.com/graphql";

    public GraphqlUtil() {

    }

    public static String getContent(String title) throws IOException {
        Connection.Response response = Jsoup.connect(questionUrl)
                .method(Connection.Method.GET)
                .execute();

        String csrftoken = response.cookie("aliyungf_tc");
        String __cfduid = response.cookie("__cfduid");
        OkHttpClient client = new OkHttpClient.Builder()
                .followRedirects(false)
                .followSslRedirects(false)
                .build();

        String query = "query{   question(titleSlug:"%s") {  questionId   translatedTitle    translatedContent    difficulty   }   }";
        String postBody = String.format(query,title);
        assert csrftoken != null;
        Request request = new Request.Builder()
                .addHeader("Content-Type","application/graphql")
                .addHeader("Referer",questionUrl)
                .addHeader("cookie","__cfduid=" + __cfduid + ";" + "csrftoken=" + csrftoken)
                .addHeader("x-csrftoken",csrftoken)
                .url(GRAPHQL_URL)
                .post(RequestBody.create(MediaType.parse("application/graphql; charset=utf-8"),postBody))
                .build();

        Response response1 = client.newCall(request).execute();
        //由于json的原因，返回的数据中文变成了Unicode码，需要另外解码
        return unicodetoString(response1.body().string());
    }

    //获取每日一题的题目内容(英文),用来构建完整的请求API
    public static String getTitle() throws IOException {
        Connection.Response response = Jsoup.connect(questionUrl)
                .method(Connection.Method.GET)
                .execute();

        String csrftoken = response.cookie("aliyungf_tc");
        String __cfduid = response.cookie("__cfduid");
        OkHttpClient client = new OkHttpClient.Builder()
                .followRedirects(false)
                .followSslRedirects(false)
                .build();

        // 获取LeetCode题目标题时的查询字符串
        String postBody = "query questionOfToday { todayRecord { question { questionFrontendId questionTitleSlug __typename } lastSubmission { id __typename } date userStatus __typename }}";
        assert csrftoken != null;
        Request request = new Request.Builder()
                .addHeader("Content-Type","application/graphql")
                .addHeader("Referer",questionUrl)
                .addHeader("cookie","__cfduid=" + __cfduid + ";" + "csrftoken=" + csrftoken)
                .addHeader("x-csrftoken",csrftoken)
                .url(GRAPHQL_URL)
                .post(RequestBody.create(MediaType.parse("application/graphql; charset=utf-8"),postBody))
                .build();

        Response response1 = client.newCall(request).execute();
        String titleInfo = unicodetoString(response1.body().string());
        //将title解析出来
        JSONObject jsonObject = JSONObject.parseObject(titleInfo);

        return jsonObject.getJSONObject("data")
                .getJSONArray("todayRecord")
                .getJSONObject(0)
                .getJSONObject("question")
                .getString("questionTitleSlug");
    }

    //解码
    public static String unicodetoString(String unicode) {
        if (unicode == null || "".equals(unicode)) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        int i = -1;
        int pos = 0;
        while ((i = unicode.indexOf("\u", pos)) != -1) {
            sb.append(unicode.substring(pos, i));
            if (i + 5 < unicode.length()) {
                pos = i + 6;
                sb.append((char) Integer.parseInt(unicode.substring(i + 2, i + 6), 16));
            }
        }
        sb.append(unicode.substring(pos));
        return sb.toString();
    }
}

引入的maven库：



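<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>LeetcodeSpider</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.3</version>
        </dependency>
        <!-- okhttp -->
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
            <version>4.9.2</version>
        </dependency>
        <!-- httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.12</version>
        </dependency>
        <!-- converter -->
        <dependency>
            <groupId>top.jfunc.common</groupId>
            <artifactId>converter</artifactId>
            <version>1.8.0</version>
        </dependency>
    </dependencies>
</project>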

思路二(利用python爬虫爬取GraphQL接口)

参考文章：https://blog.csdn.net/malloc_can/article/details/113004579

# coding=utf-8
from datetime import datetime
import requests
import json
import smtplib
from email.mime.text import MIMEText

base_url = 'https://leetcode-cn.com'
graphql_url = base_url + "/graphql"
# Seconds to wait for each HTTP call, so a stalled connection cannot hang the script.
request_timeout = 10

# Fetch the title slug (English) of today's daily question.
response = requests.post(graphql_url, json={
    "operationName": "questionOfToday",
    "variables": {},
    "query": "query questionOfToday { todayRecord {   question {     questionFrontendId     questionTitleSlug     __typename   }   lastSubmission {     id     __typename   }   date   userStatus   __typename }}"
}, timeout=request_timeout)
response.raise_for_status()
leetcodeTitle = response.json()['data']['todayRecord'][0]['question']['questionTitleSlug']

# Fetch the full record of today's daily question.
# NOTE(review): the query asks for `isPaidonly`; GraphQL field names are case-sensitive and this
# may have been meant as `isPaidOnly` -- confirm against the live schema.
url = base_url + "/problems/" + leetcodeTitle
response = requests.post(graphql_url,
                         json={"operationName": "questionData", "variables": {"titleSlug": leetcodeTitle},
                               "query": "query questionData($titleSlug: String!) {  question(titleSlug: $titleSlug) {    questionId    questionFrontendId    boundTopicId    title    titleSlug    content    translatedTitle    translatedContent    isPaidonly    difficulty    likes    dislikes    isLiked    similarQuestions    contributors {      username      profileUrl      avatarUrl      __typename    }    langToValidPlayground    topicTags {      name      slug      translatedName      __typename    }    companyTagStats    codeSnippets {      lang      langSlug      code      __typename    }    stats    hints    solution {      id      canSeeDetail      __typename    }    status    sampleTestCase    metaData    judgerAvailable    judgeType    mysqlSchemas    enableRunCode    envInfo    book {      id      bookName      pressName      source      shortDescription      fullDescription      bookImgUrl      pressImgUrl      productUrl      __typename    }    isSubscribed    isDailyQuestion    dailyRecordStatus    editorType    ugcQuestionId    style    __typename  }}"},
                         timeout=request_timeout)
response.raise_for_status()
# The "question" object of the GraphQL response.
jsonText = response.json()['data']['question']
# Frontend question number
no = jsonText.get('questionFrontendId')
# Question title (Chinese); from here on leetcodeTitle no longer holds the slug
leetcodeTitle = jsonText.get('translatedTitle')
# Difficulty level
level = jsonText.get('difficulty')
# Question body (HTML, Chinese)
context = jsonText.get('translatedContent')

# print(leetcodeTitle)
# print(context)
# print(level)
# print(no)

# Morning-quote endpoint (TianAPI; apply for a free key yourself and put the full URL here).
ana_api_url = ""
response = requests.get(ana_api_url, timeout=request_timeout)
response.raise_for_status()
# Parsed into its own name so the `json` module is not shadowed by the payload.
ana_payload = response.json()
# Quote text
ana = ana_payload['newslist'][0]['content']
# Emoticon image link
face_url = 'http://wx3.sinaimg.cn/large/007hyfXLly1g0uj7x5jpaj301o02a0sw.jpg'

# Date the script first started running (could be moved into a config file)
begin_time = datetime(2020, 12, 23)
# Number of days the script has been running
info = "本脚本已运行{0}天".format((datetime.today() - begin_time).days)

# Render everything as one HTML document. `context` is already HTML and is inserted as-is.
htmlText = (
    """<html>
    <head>
        <meta charset="UTF-8">
        <style>
            code {
                color: blue;
                font-size: larger;
            }
        </style>
    </head>
    <body>
    <div>
早安语录:"""
    + ana
    + '<img src="' + face_url + '">'
    + """
    </div>
    <h2>Leetcode-每日一题</h2>
    <h3>"""
    + no + '.' + leetcodeTitle + '.' + level
    + "</h3>"
    + context
    + '本题连接：<a href="' + url + '">' + url + "</a><br>"
    + info
    + """
    </body>
</html>"""
)

爬取Leetcode的每日一题（Java/Python）

Java相关栏目本月热门文章