The following code compares the similarity of two documents by treating each document as a word-frequency vector and measuring the angle between the two vectors via their inner product. A typical use case: a teacher can compare two students' electronically submitted assignments this way. The program prints the angle between the vectors: the smaller the angle, the more similar the documents; the larger the angle, the less similar they are.
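To make the idea concrete, here is a minimal sketch with two made-up frequency vectors (toy_angle.py and all of its values are hypothetical, not part of the assignment code): the angle between vectors v1 and v2 is acos(dot(v1, v2) / (|v1| * |v2|)).

# toy_angle.py - hypothetical worked example
import math

# Two tiny documents over the vocabulary (the, cat, dog):
v1 = [2, 1, 0]   # "the the cat"
v2 = [2, 0, 1]   # "the the dog"

dot = sum(a * b for a, b in zip(v1, v2))     # inner product: 2*2 + 1*0 + 0*1 = 4
norm1 = math.sqrt(sum(a * a for a in v1))    # |v1| = sqrt(5)
norm2 = math.sqrt(sum(b * b for b in v2))    # |v2| = sqrt(5)
print(math.acos(dot / (norm1 * norm2)))      # acos(4/5) ≈ 0.6435 radians

Identical documents give an angle of 0 radians; documents with no words in common give π/2.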
# jibensuanfa.py
import sys
import math

# Read the file into a list of lines:
def read_file(filename):
    try:
        fp = open(filename)
        L = fp.readlines()
    except IOError:
        print("Error opening or reading input file: ", filename)
        sys.exit()
    fp.close()
    return L
# Assemble characters into words:
def get_words_from_string(line):
    word_list = []
    character_list = []
    for c in line:
        if c.isalnum():
            character_list.append(c)
        elif len(character_list) > 0:
            word = "".join(character_list)
            word = word.lower()
            word_list.append(word)
            character_list = []
    # Flush the final word if the line ends with an alphanumeric character:
    if len(character_list) > 0:
        word = "".join(character_list)
        word = word.lower()
        word_list.append(word)
    return word_list
# Collect the words of the whole document:
def get_words_from_line_list(L):
    word_list = []
    for line in L:
        words_in_line = get_words_from_string(line)
        word_list = word_list + words_in_line
    return word_list
# Count how often each word occurs in the document, as a list of [word, count] pairs:
def count_frequency(word_list):
    L = []
    for new_word in word_list:
        for entry in L:
            if new_word == entry[0]:
                entry[1] = entry[1] + 1
                break
        else:  # no break: this word has not been seen before
            L.append([new_word, 1])
    return L
# Inner product of two frequency vectors:
def inner_product(L1, L2):
    sum = 0.0
    for word1, count1 in L1:
        for word2, count2 in L2:
            if word1 == word2:
                sum += count1 * count2
    return sum
# Angle between two frequency vectors:
def vector_angle(L1, L2):
    numerator = inner_product(L1, L2)
    denominator = math.sqrt(inner_product(L1, L1) * inner_product(L2, L2))
    return math.acos(numerator / denominator)
# Main program:
if __name__ == "__main__":
    file1 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample2.txt"
    file2 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample3.txt"
    # Read the files:
    L1 = read_file(file1)
    L2 = read_file(file2)
    # Extract the words of each document:
    word_list1 = get_words_from_line_list(L1)
    word_list2 = get_words_from_line_list(L2)
    # print(word_list1)
    # print(word_list2)
    # Count the frequency of every word in each file:
    LL1 = count_frequency(word_list1)
    LL2 = count_frequency(word_list2)
    # print(LL1)
    # print(LL2)
    # Compute the angle between the two vectors:
    angle = vector_angle(LL1, LL2)
    print(angle)
The output is as follows:
D:\tools2\Anaconda\python.exe C:/Users/lv/Desktop/算法设计与分析作业/实验三(文档分析和拼音纠错)/daima/wendangbijiao/jibensuanfa.py
0.870475401517548

Process finished with exit code 0
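To see what the intermediate data structures look like, here is a hedged toy run of the same pipeline (the input strings and printed lists are made up for illustration; the import assumes jibensuanfa.py lives in a wendangbijiao package, as the optimized program below also does):

# Hypothetical illustration of the intermediate representations:
from wendangbijiao import jibensuanfa as jb

lines = ["The cat sat.", "The cat, again!"]
words = jb.get_words_from_line_list(lines)
print(words)   # ['the', 'cat', 'sat', 'the', 'cat', 'again']
freq = jb.count_frequency(words)
print(freq)    # [['the', 2], ['cat', 2], ['sat', 1], ['again', 1]]
print(jb.inner_product(freq, freq))   # 2*2 + 2*2 + 1*1 + 1*1 = 10.0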
The following program optimizes some of the routines above and reports the result: each frequency vector is sorted by word once up front, so the inner product can be computed in a single linear merge pass instead of a nested loop.
# youhua.py
from wendangbijiao import jibensuanfa as jb  # import the module defined above

# Algorithm optimization:
# pre-sort the entries of each frequency vector by word
def word_frequencies_for_file(filename):
    line_list = jb.read_file(filename)
    word_list = jb.get_words_from_line_list(line_list)
    freq_mapping = jb.count_frequency(word_list)
    sorted_freq_mapping = sorted(freq_mapping)
    print("File", filename, ":")
    print(len(line_list), "lines,")
    print(len(word_list), "words,")
    print(len(sorted_freq_mapping), "distinct words")
    return sorted_freq_mapping
# Optimized inner product:
def inner_product(L1, L2):
    # L1 and L2 must be sorted by word; advance two indices in a merge-style pass
    sum = 0.0
    i = 0
    j = 0
    while i < len(L1) and j < len(L2):
        if L1[i][0] == L2[j][0]:    # only words present in both vectors contribute
            sum += L1[i][1] * L2[j][1]
            i += 1
            j += 1
        elif L1[i][0] < L2[j][0]:   # word L1[i][0] occurs in L1 but not in L2
            i += 1
        else:                       # word L2[j][0] occurs in L2 but not in L1
            j += 1
    return sum
# Count word frequencies with a dictionary data structure:
def count_frequency(word_list):
    D = {}
    for new_word in word_list:
        if new_word in D:  # dict lookup is O(1) on average (has_key is Python 2 only)
            D[new_word] = D[new_word] + 1
        else:
            D[new_word] = 1
    return list(D.items())
# Main document-comparison routine:
def main():
    filename_1 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample2.txt"  # r"..." is a raw string: backslashes are kept literally
    filename_2 = r"C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample3.txt"
    sorted_word_list_1 = word_frequencies_for_file(filename_1)
    sorted_word_list_2 = word_frequencies_for_file(filename_2)
    distance = jb.vector_angle(sorted_word_list_1, sorted_word_list_2)
    print("The distance between the documents is : %0.6f (radians)" % distance)

if __name__ == "__main__":
    import cProfile
    cProfile.run("main()")
The output is as follows:
D:\tools2\Anaconda\python.exe C:/Users/lv/Desktop/算法设计与分析作业/实验三(文档分析和拼音纠错)/daima/wendangbijiao/youhua.py
File C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample2.txt :
13 lines,
425 words,
296 distinct words
File C:\Users\lv\Desktop\算法设计与分析作业\实验三(文档分析和拼音纠错)\文档\sample3.txt :
21 lines,
548 words,
373 distinct words
The distance between the documents is : 0.870475 (radians)
16499 function calls in 0.020 seconds

Ordered by: standard name

ncalls  tottime  percall  cumtime  percall  filename:lineno(function)
     1    0.000    0.000    0.020    0.020  <string>:1(<module>)
     2    0.000    0.000    0.000    0.000  _bootlocale.py:11(getpreferredencoding)
    34    0.003    0.000    0.004    0.000  jibensuanfa.py:19(get_words_from_string)
     2    0.000    0.000    0.004    0.002  jibensuanfa.py:38(get_words_from_line_list)
     2    0.005    0.002    0.005    0.002  jibensuanfa.py:47(count_frequency)
     3    0.009    0.003    0.009    0.003  jibensuanfa.py:60(inner_product)
     2    0.000    0.000    0.000    0.000  jibensuanfa.py:7(read_file)
     1    0.000    0.000    0.009    0.009  jibensuanfa.py:70(vector_angle)
     1    0.000    0.000    0.020    0.020  youhua.py:49(main)
     2    0.000    0.000    0.010    0.005  youhua.py:7(word_frequencies_for_file)
     2    0.000    0.000    0.000    0.000  {built-in method _locale._getdefaultlocale}
     1    0.000    0.000    0.020    0.020  {built-in method builtins.exec}
  1271    0.000    0.000    0.000    0.000  {built-in method builtins.len}
     9    0.000    0.000    0.000    0.000  {built-in method builtins.print}
     2    0.000    0.000    0.000    0.000  {built-in method builtins.sorted}
     2    0.000    0.000    0.000    0.000  {built-in method io.open}
     1    0.000    0.000    0.000    0.000  {built-in method math.acos}
     1    0.000    0.000    0.000    0.000  {built-in method math.sqrt}
  6811    0.001    0.000    0.001    0.000  {method 'append' of 'list' objects}
     1    0.000    0.000    0.000    0.000  {method 'disable' of '_lsprof.Profiler' objects}
  6400    0.001    0.000    0.001    0.000  {method 'isalnum' of 'str' objects}
   973    0.000    0.000    0.000    0.000  {method 'join' of 'str' objects}
   973    0.000    0.000    0.000    0.000  {method 'lower' of 'str' objects}
     2    0.000    0.000    0.000    0.000  {method 'readlines' of '_io._IOBase' objects}
Process finished with exit code 0
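A quick way to convince yourself that the merge-based inner product matches the quadratic one is to run both on the same small sorted vectors. This is a minimal sketch under the same package-layout assumption as above; the toy vectors are made up:

# Hypothetical sanity check: both versions must agree on sorted input.
from wendangbijiao import jibensuanfa as jb
from wendangbijiao import youhua

A = sorted([["cat", 2], ["dog", 1], ["the", 3]])
B = sorted([["cat", 1], ["fish", 4], ["the", 2]])

print(jb.inner_product(A, B))      # quadratic scan: 2*1 + 3*2 = 8.0
print(youhua.inner_product(A, B))  # linear merge over sorted lists: also 8.0

For the frequency counting itself, the standard library offers the same result in one call: collections.Counter(word_list) builds the word-to-count mapping that the dictionary-based count_frequency constructs by hand.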



