I have implemented a GraphSAGE model
using dgl for link prediction. On average the auc score of the model is ~0.7 but the score varies a lot for different runs. Even though I am training and testing on the same data. I am not
splitting the data randomly, the data is fixed, and still I got AUC scores ranging from 0.2-0.85. I could not figure out where is the randomness is generating from. Is the model itself work in a random way?
Here is my code:
import random
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
import pandas as pd
import dgl.function as fn
add_param_len= True
add_arg_len = True
add_name = True
class DotPredictor(nn.Module):
def forward(self, g, h):
with g.local_scope():
# print("====>", h)
g.ndata['h'] = h
# Compute a new edge feature named 'score' by a dot-product between the
# source node feature 'h' and destination node feature 'h'.
g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
# u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
# print(g.edata['score'][:, 0])
return g.edata['score'][:, 0]
from dgl.nn import SAGEConv
# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
def __init__(self, in_feats, h_feats):
super(GraphSAGE, self).__init__()
self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
self.conv2 = SAGEConv(h_feats, h_feats, 'mean')
def forward(self, g, in_feat):
# print(g)
# print(in_feat)
h = self.conv1(g, in_feat)
# print(h)
h = F.relu(h)
h = self.conv2(g, h)
return h
class MLPPredictor(nn.Module):
def __init__(self, h_feats):
super().__init__()
self.W1 = nn.Linear(h_feats * 2, h_feats)
self.W2 = nn.Linear(h_feats, 1)
def apply_edges(self, edges):
"""
Computes a scalar score for each edge of the given graph.
Parameters
----------
edges :
Has three members ``src``, ``dst`` and ``data``, each of
which is a dictionary representing the features of the
source nodes, the destination nodes, and the edges
themselves.
Returns
-------
dict
A dictionary of new edge features.
"""
# print("here!!!!!")
h = torch.cat([edges.src['h'], edges.dst['h']], 1)
return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}
def forward(self, g, h):
with g.local_scope():
g.ndata['h'] = h
g.apply_edges(self.apply_edges)
return g.edata['score']
df = pd.read_csv('nodes.csv')
# print(df)
nodes_data = df.drop(["start_line","start_column","end_line", "end_column" ,"file_name"], axis=1)
# nodes_data
df = pd.read_csv('edges.csv', header=None)
# df
train_len = 3432
test_df = df[train_len:]
edges_data = df[:train_len]
# edges_data = df
import dgl
src = edges_data[0].to_numpy()
dst = edges_data[1].to_numpy()
# Create a DGL graph from a pair of numpy arrays
g = dgl.graph((src, dst))
# g
y= nodes_data['type'].tolist()
from sklearn import preprocessing
import torch
le = preprocessing.LabelEncoder()
targets = le.fit_transform(y)
targets = torch.as_tensor(targets)
targets = targets.type(torch.LongTensor)
type_one_hot = F.one_hot(targets)
g.ndata['x_one_hot'] = type_one_hot
y=torch.tensor(targets, dtype=torch.float32)
x=y.view(len(nodes_data),1)
g.ndata['x'] = x
if add_param_len:
le = preprocessing.LabelEncoder()
y=nodes_data['params_len'].tolist()
targets = le.fit_transform(y)
# print([x for x in targets if x!=0])
targets = torch.as_tensor(targets)
targets = targets.type(torch.LongTensor)
param_len_one_hot = F.one_hot(targets)
# param_len_one_hot
g.ndata['param_len_one_hot'] = param_len_one_hot
g.ndata['param_len'] = targets
if add_arg_len:
le = preprocessing.LabelEncoder()
y=nodes_data['argument_len'].tolist()
targets = le.fit_transform(y)
# print([x for x in targets if x!=0])
targets = torch.as_tensor(targets)
targets = targets.type(torch.LongTensor)
args_len_one_hot = F.one_hot(targets)
# param_len_one_hot
g.ndata['args_len_one_hot'] = args_len_one_hot
g.ndata['args_len'] = targets
if add_name:
le = preprocessing.LabelEncoder()
y=nodes_data['name'].tolist()
targets = le.fit_transform(y)
# targets
# for x in targets:
# print(x)
# # print([x for x in targets if x!=0])
targets = torch.as_tensor(targets)
targets = targets.type(torch.LongTensor)
# args_len_one_hot = F.one_hot(targets)
# param_len_one_hot
g.ndata['name'] = targets
# g.ndata['args_len'] = targets
u, v = g.edges()
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = len(test_df)
train_size = g.number_of_edges()
test_pos_u, test_pos_v = torch.tensor(test_df[0].to_numpy()), torch.tensor(test_df[1].to_numpy())
print(test_pos_u)
train_pos_u, train_pos_v = u,v
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(g.number_of_nodes(), g.number_of_nodes()))
print(adj.shape)
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)
neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
set(nodes_data['type'])
stmt_type=['FunctionDeclaration', 'ArrowFunctionExpression', 'FunctionExpression']
df1 = nodes_data[nodes_data.type.isin(stmt_type)]
test_neg_id = df1['id'].tolist()
# print(test_neg_id)
df1 = nodes_data[nodes_data['type']=='CallExpression']
test_neg_call_site = df1['id'].tolist()
# print(test_neg_call_site)
all_combi = [(x, y) for x in test_neg_call_site for y in test_neg_id]
print(len(all_combi))
test_data = df[3402:]
pair_list = list(zip(test_data[0].tolist(),test_data[1].tolist()))
print(pair_list)
test_neg_u, test_neg_v = [], []
for pair_ in all_combi:
# print(pair_)
if pair_ not in pair_list:
test_neg_u = np.append(test_neg_u,[pair_[0]])
test_neg_v = np.append(test_neg_v,[pair_[1]])
print(len(test_neg_u))
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
# train_g = dgl.remove_edges(g, eids[:test_size])
train_g = g
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
print(test_pos_g)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
print(test_neg_g)
from sklearn.metrics import roc_auc_score
model = GraphSAGE(g.ndata['x'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
pred = MLPPredictor(16)
# pred = DotPredictor()
def compute_loss(pos_score, neg_score):
# print("here!")
scores = torch.cat([pos_score, neg_score])
labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
return F.binary_cross_entropy_with_logits(scores, labels)
def compute_auc(pos_score, neg_score):
# print(len(neg_score))
scores = torch.cat([pos_score, neg_score]).numpy()
# print("scores =====> ", scores, len(scores))
labels = torch.cat(
[torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
# print(labels)
return roc_auc_score(labels, scores)
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
# print(model.parameters)
# print(pred.parameters)
# print(train_g.ndata['x'])
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)
# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(150):
# forward
h = model(train_g, train_g.ndata['x'])
# print("this ok======>")
pos_score = pred(train_pos_g, h)
neg_score = pred(train_neg_g, h)
loss = compute_loss(pos_score, neg_score)
# backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
if e % 5 == 0:
print('In epoch {}, loss: {}'.format(e, loss))
from sklearn.metrics import roc_auc_score
with torch.no_grad():
pos_score = pred(test_pos_g, h)
# print(pos_score)
neg_score = pred(test_neg_g, h)
# print(neg_score)
print('AUC', compute_auc(pos_score, neg_score))
from sklearn.metrics import roc_curve
scores = torch.cat([pos_score, neg_score]).numpy()
labels = torch.cat(
[torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
# roc curve for models
fpr1, tpr1, thresh1 = roc_curve(labels, scores, pos_label=1)
# print(thresh1)
# print(fpr1)
# print(tpr1)
# print("threshold ====> ",thresh1)
import matplotlib.pyplot as plt
plt.style.use('seaborn')
# plot roc curves
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
# plt.plot(fpr2, tpr2, linestyle='--',color='green', label='KNN')
# plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# plt.savefig('ROC_with_all_features',dpi=300)
plt.show();