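# First improvement on top of the double-loop (brute-force) method.
# In effect this is the Apriori algorithm with the pruning step removed.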
import math
def read():
    """Load the transaction database from ``data.txt``.

    Each line of the file is one transaction made of integer item ids
    separated by whitespace.  Lines that contain no items are skipped.

    Returns:
        A list with one list of ints per non-empty line, in file order.

    Raises:
        OSError: if ``data.txt`` cannot be opened.
        ValueError: if a token on a line is not a valid integer.
    """
    transactions = []
    # ``with`` guarantees the file is closed even when int() raises.
    with open("data.txt", 'r') as f:
        for line in f:
            # split() without an argument splits on any run of whitespace
            # and never yields empty tokens, so trailing blanks and the
            # line terminator cannot reach int().
            items = [int(token) for token in line.split()]
            if items:
                transactions.append(items)
    return transactions
def generate(lk,k):
    """Build the candidate (k+1)-itemsets from the frequent k-itemsets.

    No pruning is applied: every union of two members of ``lk`` that has
    exactly ``k + 1`` items becomes a candidate.

    Args:
        lk: the frequent k-itemsets.  When ``k == 1`` this is a flat list
            of items; otherwise it is a list of itemsets (lists of items).
        k: the size of the itemsets in ``lk``.

    Returns:
        A sorted list of candidates, each a list of items.  For ``k > 1``
        every candidate has its items in ascending order and appears only
        once.
    """
    if k == 1:
        # Every pair (i < j) of single items is a candidate 2-itemset.
        ckk = [[first, second]
               for pos, first in enumerate(lk)
               for second in lk[pos + 1:]]
    else:
        ckk = []
        seen = set()
        for pos, left in enumerate(lk):
            left_items = set(left)
            for right in lk[pos + 1:]:
                merged = left_items.union(right)
                if len(merged) != k + 1:
                    continue
                # Set iteration order is not guaranteed, so canonicalise
                # the candidate (sorted tuple) before de-duplicating;
                # otherwise the same itemset can slip in twice with its
                # items in a different order.
                candidate = tuple(sorted(merged))
                if candidate not in seen:
                    seen.add(candidate)
                    ckk.append(list(candidate))
    ckk.sort()
    return ckk
def compute_support(origin_data,ckk):
    """Count how many transactions contain each candidate itemset.

    Args:
        origin_data: the transactions, each an iterable of hashable items.
        ckk: the candidate itemsets, each an iterable of hashable items.

    Returns:
        A list of ints parallel to ``ckk``: element ``i`` is the number of
        rows of ``origin_data`` that contain every item of ``ckk[i]``.
    """
    # Build each candidate's set once instead of once per transaction.
    candidate_sets = [set(miniset) for miniset in ckk]
    ckk_count = [0] * len(ckk)
    for row in origin_data:
        row_items = set(row)
        # Count by position: looking the candidate up with list.index()
        # is linear and would credit a repeated candidate's hits to its
        # first occurrence only.
        for pos, candidate in enumerate(candidate_sets):
            if candidate <= row_items:
                ckk_count[pos] += 1
    return ckk_count
def func(origin_data, support_num):
    """Print every frequent itemset of ``origin_data``, level by level.

    Starting from the frequent single items, the candidates of the next
    size are produced with ``generate`` and counted with
    ``compute_support``; those reaching ``support_num`` form the next
    level.  The loop stops at the first empty level, after which the
    total number of frequent itemsets printed is reported.

    Args:
        origin_data: the transactions, each a list of hashable, mutually
            comparable items.
        support_num: minimum number of transactions an itemset must
            appear in to be considered frequent.

    Returns:
        None.  All results are written to standard output.
    """
    # Build L1, the frequent single items.
    c1 = {}
    for row in origin_data:
        # Iterate over set(row): support is the number of transactions
        # containing the item, so an item repeated inside one transaction
        # must count once (the same rule compute_support applies).
        for num in set(row):
            c1[num] = c1.get(num, 0) + 1
    lk = sorted(num for num, occurrences in c1.items()
                if occurrences >= support_num)

    # Level-wise loop.
    k = 1
    count = 0  # total number of frequent itemsets printed so far
    while lk:
        # Print the frequent k-itemsets.
        print("{}项频繁集:".format(k))
        col_num = 0
        for i in lk:
            # Start a new output line after every 11 entries.
            if col_num == 11:
                print()
                col_num = 0
            if k == 1:
                print("{:>2}".format(i), end=' ')
            else:
                print(i, end=' ')
            count += 1
            col_num += 1
        # End the current output line and leave a blank line after it.
        print("\n")
        # Candidates for the frequent (k+1)-itemsets.
        ckk = generate(lk, k)
        # Support counts, parallel to ckk.
        ckk_count = compute_support(origin_data, ckk)
        # Pair each candidate with its own count by position rather than
        # searching for it again with list.index().
        lk = [miniset for miniset, occurrences in zip(ckk, ckk_count)
              if occurrences >= support_num]
        k += 1
    print("共{}项".format(count))
if __name__ == "__main__":
    # The minimum support is read as a float and multiplied by the number
    # of transactions below, i.e. it is a relative threshold.
    support = float(input("请输入最小支持度"))
    origin_data = read()
    # Convert the relative threshold into an absolute transaction count,
    # rounding up.
    support_num = math.ceil(support * len(origin_data))
    # Tab-separated summary line (the tab escapes were missing their
    # backslash, which printed a literal "t" before each label).
    print("数据行数:", len(origin_data),
          "\t最小支持度为:", support, "\t最小支持数:", support_num)
    func(origin_data, support_num)



