The following code compares the similarity of two documents by treating each document as a word-frequency vector and measuring the angle between the two vectors via their inner product. A typical use case: a teacher can compare two students' electronically submitted assignments this way. The program prints the angle between the vectors: the smaller the angle, the more similar the documents; the larger the angle, the less similar they are.
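To make the idea concrete, here is a minimal sketch with two made-up frequency vectors (toy_angle.py and all of its values are hypothetical, not part of the assignment code): the angle between vectors v1 and v2 is acos(dot(v1, v2) / (|v1| * |v2|)).

# toy_angle.py - hypothetical worked example
import math

# Two tiny documents over the vocabulary (the, cat, dog):
v1 = [2, 1, 0]   # "the the cat"
v2 = [2, 0, 1]   # "the the dog"

dot = sum(a * b for a, b in zip(v1, v2))     # inner product: 2*2 + 1*0 + 0*1 = 4
norm1 = math.sqrt(sum(a * a for a in v1))    # |v1| = sqrt(5)
norm2 = math.sqrt(sum(b * b for b in v2))    # |v2| = sqrt(5)
print(math.acos(dot / (norm1 * norm2)))      # acos(4/5) ≈ 0.6435 radians

Identical documents give an angle of 0 radians; documents with no words in common give π/2.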
# jibensuanfa.py
import sys
import math

# Read the file into a list of lines:
def read_file(filename):
    try:
        fp = open(filename)
        L = fp.readlines()
    except IOError:
        print("Error opening or reading input file: ", filename)
        sys.exit()
    fp.close()
    return L
# Assemble characters into words:
def get_words_from_string(line):
    word_list = []
    character_list = []
    for c in line:
        if c.isalnum():
            character_list.append(c)
        elif len(character_list) > 0:
            word = "".join(character_list)
            word = word.lower()
            word_list.append(word)
            character_list = []
    # Flush the final word if the line ends with an alphanumeric character:
    if len(character_list) > 0:
        word = "".join(character_list)
        word = word.lower()
        word_list.append(word)
    return word_list
# Collect the words of the whole document:
def get_words_from_line_list(L):
    word_list = []
    for line in L:
        words_in_line = get_words_from_string(line)
        word_list = word_list + words_in_line
    return word_list
# Count how often each word occurs in the document, as a list of [word, count] pairs:
def count_frequency(word_list):
    L = []
    for new_word in word_list:
        for entry in L:
            if new_word == entry[0]:
                entry[1] = entry[1] + 1
                break
        else:  # no break: this word has not been seen before
            L.append([new_word, 1])
    return L
# Inner product of two frequency vectors:
def inner_product(L1, L2):
    sum = 0.0
    for word1, count1 in L1:
        for word2, count2 in L2:
            if word1 == word2:
                sum += count1 * count2
    return sum
# Angle between two frequency vectors:
def vector_angle(L1, L2):
    numerator = inner_product(L1, L2)
    denominator = math.sqrt(inner_product(L1, L1) * inner_product(L2, L2))
    return math.acos(numerator / denominator)
# Main program:
if __name__ == "__main__":
    file1 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample2.txt"
    file2 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample3.txt"
    # Read the files:
    L1 = read_file(file1)
    L2 = read_file(file2)
    # Extract the words of each document:
    word_list1 = get_words_from_line_list(L1)
    word_list2 = get_words_from_line_list(L2)
    # print(word_list1)
    # print(word_list2)
    # Count the frequency of every word in each file:
    LL1 = count_frequency(word_list1)
    LL2 = count_frequency(word_list2)
    # print(LL1)
    # print(LL2)
    # Compute the angle between the two vectors:
    angle = vector_angle(LL1, LL2)
    print(angle)
The output is as follows:
D:\tools2\Anaconda\python.exe C:/Users/lv/Desktop/算法设计与分析作业/实验三(文档分析和拼音纠错)/daima/wendangbijiao/jibensuanfa.py
0.870475401517548

Process finished with exit code 0
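To see what the intermediate data structures look like, here is a hedged toy run of the same pipeline (the input strings and printed lists are made up for illustration; the import assumes jibensuanfa.py lives in a wendangbijiao package, as the optimized program below also does):

# Hypothetical illustration of the intermediate representations:
from wendangbijiao import jibensuanfa as jb

lines = ["The cat sat.", "The cat, again!"]
words = jb.get_words_from_line_list(lines)
print(words)   # ['the', 'cat', 'sat', 'the', 'cat', 'again']
freq = jb.count_frequency(words)
print(freq)    # [['the', 2], ['cat', 2], ['sat', 1], ['again', 1]]
print(jb.inner_product(freq, freq))   # 2*2 + 2*2 + 1*1 + 1*1 = 10.0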
The following program optimizes some of the routines above and reports the result: each frequency vector is sorted by word once up front, so the inner product can be computed in a single linear merge pass instead of a nested loop.
# youhua.py
from wendangbijiao import jibensuanfa as jb  # import the module defined above

# Algorithm optimization:
# pre-sort the entries of each frequency vector by word
def word_frequencies_for_file(filename):
    line_list = jb.read_file(filename)
    word_list = jb.get_words_from_line_list(line_list)
    freq_mapping = jb.count_frequency(word_list)
    sorted_freq_mapping = sorted(freq_mapping)
    print("File", filename, ":")
    print(len(line_list), "lines,")
    print(len(word_list), "words,")
    print(len(sorted_freq_mapping), "distinct words")
    return sorted_freq_mapping
# Optimized inner product:
def inner_product(L1, L2):
    # L1 and L2 must be sorted by word; advance two indices in a merge-style pass
    sum = 0.0
    i = 0
    j = 0
    while i < len(L1) and j < len(L2):
        if L1[i][0] == L2[j][0]:    # only words present in both vectors contribute
            sum += L1[i][1] * L2[j][1]
            i += 1
            j += 1
        elif L1[i][0] < L2[j][0]:   # word L1[i][0] occurs in L1 but not in L2
            i += 1
        else:                       # word L2[j][0] occurs in L2 but not in L1
            j += 1
    return sum
# Count word frequencies with a dictionary data structure:
def count_frequency(word_list):
    D = {}
    for new_word in word_list:
        if new_word in D:  # dict lookup is O(1) on average (has_key is Python 2 only)
            D[new_word] = D[new_word] + 1
        else:
            D[new_word] = 1
    return list(D.items())
# Main document-comparison routine:
def main():
    filename_1 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample2.txt"  # r"..." is a raw string: backslashes are kept literally
    filename_2 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample3.txt"
    sorted_word_list_1 = word_frequencies_for_file(filename_1)
    sorted_word_list_2 = word_frequencies_for_file(filename_2)
    distance = jb.vector_angle(sorted_word_list_1, sorted_word_list_2)
    print("The distance between the documents is : %0.6f (radians)" % distance)

if __name__ == "__main__":
    import cProfile
    cProfile.run("main()")
The output is as follows:
D:\tools2\Anaconda\python.exe C:/Users/lv/Desktop/算法设计与分析作业/实验三(文档分析和拼音纠错)/daima/wendangbijiao/youhua.py
File C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample2.txt :
13 lines,
425 words,
296 distinct words
File C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample3.txt :
21 lines,
548 words,
373 distinct words
The distance between the documents is : 0.870475 (radians)
16499 function calls in 0.020 seconds

Ordered by: standard name

ncalls  tottime  percall  cumtime  percall  filename:lineno(function)
     1    0.000    0.000    0.020    0.020  <string>:1(<module>)
     2    0.000    0.000    0.000    0.000  _bootlocale.py:11(getpreferredencoding)
    34    0.003    0.000    0.004    0.000  jibensuanfa.py:19(get_words_from_string)
     2    0.000    0.000    0.004    0.002  jibensuanfa.py:38(get_words_from_line_list)
     2    0.005    0.002    0.005    0.002  jibensuanfa.py:47(count_frequency)
     3    0.009    0.003    0.009    0.003  jibensuanfa.py:60(inner_product)
     2    0.000    0.000    0.000    0.000  jibensuanfa.py:7(read_file)
     1    0.000    0.000    0.009    0.009  jibensuanfa.py:70(vector_angle)
     1    0.000    0.000    0.020    0.020  youhua.py:49(main)
     2    0.000    0.000    0.010    0.005  youhua.py:7(word_frequencies_for_file)
     2    0.000    0.000    0.000    0.000  {built-in method _locale._getdefaultlocale}
     1    0.000    0.000    0.020    0.020  {built-in method builtins.exec}
  1271    0.000    0.000    0.000    0.000  {built-in method builtins.len}
     9    0.000    0.000    0.000    0.000  {built-in method builtins.print}
     2    0.000    0.000    0.000    0.000  {built-in method builtins.sorted}
     2    0.000    0.000    0.000    0.000  {built-in method io.open}
     1    0.000    0.000    0.000    0.000  {built-in method math.acos}
     1    0.000    0.000    0.000    0.000  {built-in method math.sqrt}
  6811    0.001    0.000    0.001    0.000  {method 'append' of 'list' objects}
     1    0.000    0.000    0.000    0.000  {method 'disable' of '_lsprof.Profiler' objects}
  6400    0.001    0.000    0.001    0.000  {method 'isalnum' of 'str' objects}
   973    0.000    0.000    0.000    0.000  {method 'join' of 'str' objects}
   973    0.000    0.000    0.000    0.000  {method 'lower' of 'str' objects}
     2    0.000    0.000    0.000    0.000  {method 'readlines' of '_io._IOBase' objects}
Process finished with exit code 0
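A quick way to convince yourself that the merge-based inner product matches the quadratic one is to run both on the same small sorted vectors. This is a minimal sketch under the same package-layout assumption as above; the toy vectors are made up:

# Hypothetical sanity check: both versions must agree on sorted input.
from wendangbijiao import jibensuanfa as jb
from wendangbijiao import youhua

A = sorted([["cat", 2], ["dog", 1], ["the", 3]])
B = sorted([["cat", 1], ["fish", 4], ["the", 2]])

print(jb.inner_product(A, B))      # quadratic scan: 2*1 + 3*2 = 8.0
print(youhua.inner_product(A, B))  # linear merge over sorted lists: also 8.0

For the frequency counting itself, the standard library offers the same result in one call: collections.Counter(word_list) builds the word-to-count mapping that the dictionary-based count_frequency constructs by hand.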



