代码可以在github上fork,本文主要是加了一些注释,并且搭配本人所作笔记【HGAN代码加笔记的理解】
main.py
import torch
from sklearn.metrics import f1_score
from utils import load_data, EarlyStopping
def score(logits, labels):
    """Return (accuracy, micro-F1, macro-F1) for class logits vs. labels."""
    # The highest-scoring class per row is the prediction; argmax gives the
    # same index tensor as the torch.max(..., dim=1) indices output.
    preds = torch.argmax(logits, dim=1).long().cpu().numpy()
    truth = labels.cpu().numpy()
    accuracy = (preds == truth).sum() / len(preds)
    micro_f1 = f1_score(truth, preds, average='micro')
    macro_f1 = f1_score(truth, preds, average='macro')
    return accuracy, micro_f1, macro_f1
def evaluate(model, g, features, labels, mask, loss_func):
    """Evaluate the model on the nodes selected by ``mask``.

    Returns (loss, accuracy, micro-F1, macro-F1); gradients are disabled
    for the forward pass.
    """
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
    masked_logits = logits[mask]
    masked_labels = labels[mask]
    loss = loss_func(masked_logits, masked_labels)
    accuracy, micro_f1, macro_f1 = score(masked_logits, masked_labels)
    return loss, accuracy, micro_f1, macro_f1
def main(args):
    """Train HAN on the configured ACM variant and report test metrics.

    Parameters
    ----------
    args : dict
        Configuration produced by ``utils.setup``; must contain 'dataset',
        'device', 'hetero', 'hidden_units', 'num_heads', 'dropout', 'lr',
        'weight_decay', 'num_epochs' and 'patience'.
    """
    # If args['hetero'] is True, g would be a heterogeneous graph.
    # Otherwise, it will be a list of homogeneous graphs.
    # Bug fix: this multi-target unpacking was split across two lines without
    # a continuation character; wrapping it in parentheses makes it valid.
    (g, features, labels, num_classes, train_idx, val_idx, test_idx,
     train_mask, val_mask, test_mask) = load_data(args['dataset'])

    if hasattr(torch, 'BoolTensor'):
        # Newer PyTorch builds index with bool masks; convert the byte masks
        # when BoolTensor is available.
        train_mask = train_mask.bool()
        val_mask = val_mask.bool()
        test_mask = test_mask.bool()

    # Move all tensors to the configured device (CPU or CUDA).
    features = features.to(args['device'])
    labels = labels.to(args['device'])
    train_mask = train_mask.to(args['device'])
    val_mask = val_mask.to(args['device'])
    test_mask = test_mask.to(args['device'])

    if args['hetero']:
        # DGL-native heterogeneous graph: HAN extracts meta-path neighbors
        # itself from the listed edge-type sequences (PAP and PFP), then
        # applies node-level and semantic-level attention.
        from model_hetero import HAN
        model = HAN(meta_paths=[['pa', 'ap'], ['pf', 'fp']],
                    in_size=features.shape[1],
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],
                    dropout=args['dropout']).to(args['device'])
        g = g.to(args['device'])
    else:
        # Pre-built homogeneous graphs: one adjacency matrix per meta-path.
        from model import HAN
        model = HAN(num_meta_paths=len(g),
                    in_size=features.shape[1],
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],
                    dropout=args['dropout']).to(args['device'])
        g = [graph.to(args['device']) for graph in g]

    stopper = EarlyStopping(patience=args['patience'])
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'],
                                 weight_decay=args['weight_decay'])

    for epoch in range(args['num_epochs']):
        model.train()
        logits = model(g, features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc, train_micro_f1, train_macro_f1 = score(logits[train_mask], labels[train_mask])
        val_loss, val_acc, val_micro_f1, val_macro_f1 = evaluate(model, g, features, labels, val_mask, loss_fcn)
        early_stop = stopper.step(val_loss.data.item(), val_acc, model)

        print('Epoch {:d} | Train Loss {:.4f} | Train Micro f1 {:.4f} | Train Macro f1 {:.4f} | '
              'Val Loss {:.4f} | Val Micro f1 {:.4f} | Val Macro f1 {:.4f}'.format(
            epoch + 1, loss.item(), train_micro_f1, train_macro_f1, val_loss.item(), val_micro_f1, val_macro_f1))

        if early_stop:
            break

    # Restore the best checkpoint before the final test evaluation.
    stopper.load_checkpoint(model)
    test_loss, test_acc, test_micro_f1, test_macro_f1 = evaluate(model, g, features, labels, test_mask, loss_fcn)
    print('Test loss {:.4f} | Test Micro f1 {:.4f} | Test Macro f1 {:.4f}'.format(
        test_loss.item(), test_micro_f1, test_macro_f1))
if __name__ == '__main__':
    import argparse

    from utils import setup

    parser = argparse.ArgumentParser('HAN')
    parser.add_argument('-s', '--seed', type=int, default=1,
                        help='Random seed')
    parser.add_argument('-ld', '--log-dir', type=str, default='results',
                        help='Dir for saving training results')
    # Bug fix: the help text contained an unescaped apostrophe ("DGL's")
    # inside a single-quoted string, which is a SyntaxError; double quotes
    # fix it without changing the displayed text.
    parser.add_argument('--hetero', action='store_true',
                        help="Use metapath coalescing with DGL's own dataset")
    args = parser.parse_args().__dict__

    # setup() merges the default hyper-parameters, seeds the RNGs and
    # chooses the dataset/device/log directory.
    args = setup(args)

    main(args)
model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
# DGL是一个专门用于深度学习图形的Python包, 一款面向图神经网络以及图机器学习的全新框架, 简化了基于图形的神经网络的实现。
from dgl.nn.pytorch import GATConv
class SemanticAttention(nn.Module):
    """Semantic-level attention that fuses per-meta-path node embeddings.

    Given ``z`` of shape (N, M, D) — N nodes, M meta-paths, D features — a
    small two-layer MLP scores each meta-path; softmax over those scores
    yields one weight beta_m per meta-path, and the output is the weighted
    sum over the meta-path axis, shape (N, D).
    """

    def __init__(self, in_size, hidden_size=128):
        super(SemanticAttention, self).__init__()
        # Scorer: Linear -> Tanh -> bias-free Linear down to one scalar.
        self.project = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1, bias=False)
        )

    def forward(self, z):
        # Mean over the node axis gives one raw score per meta-path. (M, 1)
        scores = self.project(z).mean(0)
        # Normalize scores into per-meta-path weights beta.          (M, 1)
        weights = torch.softmax(scores, dim=0)
        # Broadcast the weights over all N nodes.                    (N, M, 1)
        weights = weights.expand((z.shape[0],) + weights.shape)
        # Weighted sum over meta-paths: final node embeddings.       (N, D)
        return (weights * z).sum(1)
class HANLayer(nn.Module):
    """One HAN layer: a GAT per meta-path, then semantic attention.

    Arguments
    ---------
    num_meta_paths : number of homogeneous graphs generated from the metapaths.
    in_size : input feature dimension
    out_size : output feature dimension
    layer_num_heads : number of attention heads
    dropout : Dropout probability

    Inputs
    ------
    gs : list[DGLGraph]
        One homogeneous graph per meta-path.
    h : tensor
        Input node features.

    Outputs
    -------
    tensor
        Fused node embeddings, shape (N, out_size * layer_num_heads).
    """

    def __init__(self, num_meta_paths, in_size, out_size, layer_num_heads, dropout):
        super(HANLayer, self).__init__()
        # One GAT layer per meta-path-based adjacency matrix; the same
        # dropout rate is used for both feature and attention dropout.
        self.gat_layers = nn.ModuleList([
            GATConv(in_size, out_size, layer_num_heads,
                    dropout, dropout, activation=F.elu)
            for _ in range(num_meta_paths)
        ])
        # Fuses the per-meta-path embeddings into a single one per node.
        self.semantic_attention = SemanticAttention(in_size=out_size * layer_num_heads)
        self.num_meta_paths = num_meta_paths

    def forward(self, gs, h):
        # Per meta-path: GAT output is (N, heads, out_size); flattening the
        # head axis gives (N, heads * out_size).
        per_path = [self.gat_layers[i](g, h).flatten(1) for i, g in enumerate(gs)]
        # Stack along a new meta-path axis: (N, M, D * K).
        stacked = torch.stack(per_path, dim=1)
        # Semantic attention aggregates over the meta-path axis: (N, D * K).
        return self.semantic_attention(stacked)
class HAN(nn.Module):
    """Full HAN model: stacked HANLayers plus a linear classifier.

    ``num_heads`` is a list; entry ``l`` is the head count of layer ``l``,
    so the model depth equals ``len(num_heads)``. ``out_size`` is the number
    of target classes.
    """

    def __init__(self, num_meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        super(HAN, self).__init__()
        head_counts = list(num_heads)
        # Layer 0 consumes the raw features; every later layer consumes the
        # previous layer's (hidden_size * heads) output.
        input_dims = [in_size] + [hidden_size * k for k in head_counts[:-1]]
        self.layers = nn.ModuleList(
            HANLayer(num_meta_paths, dim, hidden_size, heads, dropout)
            for dim, heads in zip(input_dims, head_counts)
        )
        self.predict = nn.Linear(hidden_size * head_counts[-1], out_size)

    def forward(self, g, h):
        # Each HANLayer applies node-level GATs and semantic attention.
        for layer in self.layers:
            h = layer(g, h)
        return self.predict(h)
utils.py
import datetime
import dgl
import errno
import numpy as np
import os
import pickle
import random
import torch
from dgl.data.utils import download, get_download_dir, _get_dgl_url
from pprint import pprint
from scipy import sparse
from scipy import io as sio
# utils: dataset loading helpers and the early-stopping strategy.
def set_random_seed(seed=0):
    """Seed Python, NumPy and PyTorch RNGs (CUDA too, when available).

    Parameters
    ----------
    seed : int
        Random seed to use.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
def mkdir_p(path, log=True):
    """Create a directory (and missing parents) for the specified path.

    Parameters
    ----------
    path : str
        Path name.
    log : bool
        Whether to print result for directory creation.

    Raises
    ------
    OSError
        If creation fails for any reason other than the directory already
        existing.
    """
    try:
        os.makedirs(path)
        if log:
            print('Created directory {}'.format(path))
    except OSError as exc:
        # Bug fix: previously `log` was AND-ed into this condition, so an
        # already-existing directory raised when log=False. The tolerance
        # check and the logging decision are now independent.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            if log:
                print('Directory {} already exists.'.format(path))
        else:
            raise
def get_date_postfix():
    """Get a date based postfix for directory name.

    Returns
    -------
    str
        Timestamp shaped like ``YYYY-MM-DD_HH-MM-SS``.
    """
    now = datetime.datetime.now()
    return '{}_{:02d}-{:02d}-{:02d}'.format(
        now.date(), now.hour, now.minute, now.second)
def setup_log_dir(args, sampling=False):
    """Name and create the logging directory.

    Parameters
    ----------
    args : dict
        Configuration; 'log_dir' and 'dataset' are read here.
    sampling : bool
        Whether we are using sampling based training.

    Returns
    -------
    str
        Path of the created logging directory.
    """
    stamp = get_date_postfix()
    log_dir = os.path.join(args['log_dir'],
                           '{}_{}'.format(args['dataset'], stamp))
    if sampling:
        log_dir += '_sampling'
    mkdir_p(log_dir)
    return log_dir
# The configuration below is from the paper.
default_configure = {
    'lr': 0.005,          # Learning rate
    'num_heads': [8],     # Number of attention heads for node-level attention
    'hidden_units': 8,    # Hidden dimension per head
    'dropout': 0.6,       # Feature/attention dropout probability
    'weight_decay': 0.001,
    'num_epochs': 200,
    'patience': 100,      # Early-stopping patience (epochs)
}

# Extra settings used only by the sampling-based training path.
sampling_configure = {
    'batch_size': 20,
}
def setup(args):
    """Finalize the run configuration: defaults, RNG seed, dataset, device.

    Mutates ``args`` in place and also returns it.
    """
    args.update(default_configure)
    set_random_seed(args['seed'])
    # The raw .mat dump is used for the heterogeneous-graph variant.
    args['dataset'] = 'ACMRaw' if args['hetero'] else 'ACM'
    has_cuda = torch.cuda.is_available()
    args['device'] = 'cuda:0' if has_cuda else 'cpu'
    args['log_dir'] = setup_log_dir(args)
    return args
def setup_for_sampling(args):
    """Finalize configuration for the sampling-based training variant.

    Mutates ``args`` in place and also returns it. Note this path seeds the
    RNGs with the default seed (0), not args['seed'].
    """
    args.update(default_configure)
    args.update(sampling_configure)
    set_random_seed()
    has_cuda = torch.cuda.is_available()
    args['device'] = 'cuda:0' if has_cuda else 'cpu'
    args['log_dir'] = setup_log_dir(args, sampling=True)
    return args
def get_binary_mask(total_size, indices):
    """Return a uint8 mask of length ``total_size`` with ones at ``indices``."""
    selected = torch.zeros(total_size)
    selected[indices] = 1
    return selected.byte()
def load_acm(remove_self_loop):
    """Load the preprocessed ACM3025 dataset.

    Returns a 10-tuple: ([PAP graph, PLP graph], features, labels,
    num_classes, train/val/test indices, train/val/test masks).

    Parameters
    ----------
    remove_self_loop : bool
        If True, subtract the identity from both meta-path adjacencies.
    """
    url = 'dataset/ACM3025.pkl'
    data_path = get_download_dir() + '/ACM3025.pkl'
    # The download only needs to run once; after the file is cached locally
    # this line can stay commented out.
    # download(_get_dgl_url(url), path=data_path)

    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    # todense() converts the sparse matrices to dense arrays.
    # Bug fix: this two-target assignment was split across two lines without
    # a continuation character; the backslash restores the original intent.
    labels, features = torch.from_numpy(data['label'].todense()).long(), \
        torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    labels = labels.nonzero()[:, 1]  # one-hot rows -> integer class ids

    if remove_self_loop:
        # Subtracting the identity removes self loops from both adjacencies.
        num_nodes = data['label'].shape[0]
        data['PAP'] = sparse.csr_matrix(data['PAP'] - np.eye(num_nodes))
        data['PLP'] = sparse.csr_matrix(data['PLP'] - np.eye(num_nodes))

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.from_scipy(data['PAP'])   # paper-author-paper meta-path
    subject_g = dgl.from_scipy(data['PLP'])  # paper-subject-paper meta-path
    gs = [author_g, subject_g]

    # torch.from_numpy shares memory with the ndarray; squeeze(0) drops the
    # leading singleton dimension of the stored index arrays.
    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = author_g.number_of_nodes()
    # Masks are 1 at the split's node positions, 0 elsewhere.
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    print('dataset loaded')
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    # Bug fix: the return statement also lost its line continuation, which
    # truncated the returned tuple; parentheses restore the full 10-tuple.
    return (gs, features, labels, num_classes, train_idx, val_idx, test_idx,
            train_mask, val_mask, test_mask)
def load_acm_raw(remove_self_loop):
    """Build a heterogeneous DGL graph from the raw ACM.mat dump.

    Papers are linked to authors ('pa'/'ap') and fields ('pf'/'fp'); labels
    come from the publishing conference. Self-loop removal is not supported
    here, hence the assert.

    Returns a 10-tuple: (hetero graph, features, labels, num_classes,
    train/val/test indices, train/val/test masks).
    """
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    # Keep only papers published at one of the selected conferences.
    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    features = torch.FloatTensor(p_vs_t.toarray())

    # Derive each paper's label from the conference it appeared at.
    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    # Random split per conference: <=0.2 train, (0.2, 0.3] val, >0.3 test.
    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    # Bug fix: the return statement was split across two lines without a
    # continuation, which truncated the returned tuple; parentheses restore
    # the full 10-tuple.
    return (hg, features, labels, num_classes, train_idx, val_idx, test_idx,
            train_mask, val_mask, test_mask)
def load_data(dataset, remove_self_loop=False):
    """Dispatch to the loader for ``dataset`` ('ACM' or 'ACMRaw').

    Raises
    ------
    NotImplementedError
        For any other dataset name.
    """
    if dataset == 'ACM':
        return load_acm(remove_self_loop)
    elif dataset == 'ACMRaw':
        return load_acm_raw(remove_self_loop)
    else:
        # Bug fix: the exception was previously *returned* instead of
        # raised, so callers would try to unpack an exception object.
        raise NotImplementedError('Unsupported dataset {}'.format(dataset))
class EarlyStopping(object):
    """Stop training once validation loss and accuracy both stop improving.

    A checkpoint is written whenever the model matches or improves on both
    the best validation loss and the best validation accuracy seen so far;
    patience is consumed only when both metrics get strictly worse.
    """

    def __init__(self, patience=10):
        now = datetime.datetime.now()
        # Timestamped filename so parallel runs don't clobber each other.
        self.filename = 'early_stop_{}_{:02d}-{:02d}-{:02d}.pth'.format(
            now.date(), now.hour, now.minute, now.second)
        self.patience = patience
        self.counter = 0
        self.best_acc = None
        self.best_loss = None
        self.early_stop = False

    def step(self, loss, acc, model):
        """Record one validation result; return True once patience runs out."""
        if self.best_loss is None:
            # First evaluation: everything seen so far is the best.
            self.best_acc = acc
            self.best_loss = loss
            self.save_checkpoint(model)
        elif loss > self.best_loss and acc < self.best_acc:
            # Both metrics regressed: consume one unit of patience.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # At least one metric held or improved; checkpoint only when
            # both did.
            if loss <= self.best_loss and acc >= self.best_acc:
                self.save_checkpoint(model)
            self.best_loss = np.min((loss, self.best_loss))
            self.best_acc = np.max((acc, self.best_acc))
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        """Saves model when validation loss decreases."""
        torch.save(model.state_dict(), self.filename)

    def load_checkpoint(self, model):
        """Load the latest checkpoint."""
        model.load_state_dict(torch.load(self.filename))



