Source code for deeprobust.graph.defense.gcn_preprocess

import torch.nn as nn
import torch.nn.functional as F
import math
import torch
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
from deeprobust.graph import utils
from deeprobust.graph.defense import GCN
from tqdm import tqdm
import scipy.sparse as sp
import numpy as np
from numba import njit

class GCNSVD(GCN):
    """GCNSVD is a 2-layer Graph Convolutional Network with truncated SVD as
    preprocessing. See more details in All You Need Is Low (Rank): Defending
    Against Adversarial Attacks on Graphs,
    https://dl.acm.org/doi/abs/10.1145/3336191.3371789.

    Parameters
    ----------
    nfeat : int
        size of input feature dimension
    nhid : int
        number of hidden units
    nclass : int
        size of output dimension
    dropout : float
        dropout rate for GCN
    lr : float
        learning rate for GCN
    weight_decay : float
        weight decay coefficient (l2 normalization) for GCN.
        When `with_relu` is True, `weight_decay` will be set to 0.
    with_relu : bool
        whether to use relu activation function. If False, GCN will be linearized.
    with_bias : bool
        whether to include bias term in GCN weights.
    device : str
        'cpu' or 'cuda'.

    Examples
    --------
    We can first load the dataset and then train GCNSVD.

    >>> from deeprobust.graph.data import PrePtbDataset, Dataset
    >>> from deeprobust.graph.defense import GCNSVD
    >>> # load clean graph data
    >>> data = Dataset(root='/tmp/', name='cora', seed=15)
    >>> adj, features, labels = data.adj, data.features, data.labels
    >>> idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
    >>> # load perturbed graph data
    >>> perturbed_data = PrePtbDataset(root='/tmp/', name='cora')
    >>> perturbed_adj = perturbed_data.adj
    >>> # train defense model
    >>> model = GCNSVD(nfeat=features.shape[1], nhid=16,
    ...                nclass=labels.max().item() + 1,
    ...                dropout=0.5, device='cpu').to('cpu')
    >>> model.fit(features, perturbed_adj, labels, idx_train, idx_val, k=20)
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.5, lr=0.01,
                 weight_decay=5e-4, with_relu=True, with_bias=True, device='cpu'):

        super(GCNSVD, self).__init__(nfeat, nhid, nclass, dropout, lr,
                                     weight_decay, with_relu, with_bias, device=device)
        self.device = device
        self.k = None

    def fit(self, features, adj, labels, idx_train, idx_val=None, k=50,
            train_iters=200, initialize=True, verbose=True, **kwargs):
        """First perform a rank-k approximation of the adjacency matrix via
        truncated SVD, and then train the GCN model on the processed graph.
        When idx_val is not None, pick the best model according to the
        validation loss.

        Parameters
        ----------
        features :
            node features
        adj :
            the adjacency matrix. The format could be torch.Tensor or scipy matrix
        labels :
            node labels
        idx_train :
            node training indices
        idx_val :
            node validation indices. If not given (None), the GCN training
            process will not adopt early stopping
        k : int
            number of singular values and vectors to compute.
        train_iters : int
            number of training epochs
        initialize : bool
            whether to initialize parameters before training
        verbose : bool
            whether to show verbose logs
        """

        modified_adj = self.truncatedSVD(adj, k=k)
        self.k = k
        # modified_adj_tensor = utils.sparse_mx_to_torch_sparse_tensor(self.modified_adj)
        features, modified_adj, labels = utils.to_tensor(features, modified_adj,
                                                         labels, device=self.device)

        self.modified_adj = modified_adj
        self.features = features
        self.labels = labels
        super().fit(features, modified_adj, labels, idx_train, idx_val,
                    train_iters=train_iters, initialize=initialize, verbose=verbose)

    def truncatedSVD(self, data, k=50):
        """Truncated SVD on the input data. A standalone toy sketch of this
        rank-k reconstruction (`_demo_truncated_svd`) follows the class
        definition.

        Parameters
        ----------
        data :
            input matrix to be decomposed
        k : int
            number of singular values and vectors to compute.

        Returns
        -------
        numpy.array
            reconstructed matrix.
        """
        print('=== GCN-SVD: rank={} ==='.format(k))
        if sp.issparse(data):
            # svds already returns only the top-k singular triplets
            data = data.asfptype()
            U, S, V = sp.linalg.svds(data, k=k)
            print("rank_after = {}".format(len(S.nonzero()[0])))
            diag_S = np.diag(S)
        else:
            U, S, V = np.linalg.svd(data)
            # report the rank before truncation, then keep the top-k components
            # (the original printed "rank_before" after S had already been sliced)
            print("rank_before = {}".format(len(S.nonzero()[0])))
            U = U[:, :k]
            S = S[:k]
            V = V[:k, :]
            diag_S = np.diag(S)
            print("rank_after = {}".format(len(diag_S.nonzero()[0])))

        return U @ diag_S @ V

    def predict(self, features=None, adj=None):
        """By default, the inputs should be the unnormalized adjacency matrix.

        Parameters
        ----------
        features :
            node features. If `features` and `adj` are not given, this function
            will use the previously stored `features` and `adj` from training
            to make predictions.
        adj :
            adjacency matrix. If `features` and `adj` are not given, this function
            will use the previously stored `features` and `adj` from training
            to make predictions.

        Returns
        -------
        torch.FloatTensor
            output (log probabilities) of GCNSVD
        """

        self.eval()
        if features is None and adj is None:
            return self.forward(self.features, self.adj_norm)
        else:
            adj = self.truncatedSVD(adj, k=self.k)
            if type(adj) is not torch.Tensor:
                features, adj = utils.to_tensor(features, adj, device=self.device)

            self.features = features
            if utils.is_sparse_tensor(adj):
                self.adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
            else:
                self.adj_norm = utils.normalize_adj_tensor(adj)
            return self.forward(self.features, self.adj_norm)
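
# --- Illustrative sketch (not part of the original module) ----------------
# A minimal, hedged example of the rank-k preprocessing that
# `GCNSVD.truncatedSVD` performs, shown on a hypothetical toy adjacency
# matrix. The name `_demo_truncated_svd` and the toy graph are assumptions
# made for illustration; only `scipy.sparse.linalg.svds` and numpy are used.
def _demo_truncated_svd(k=2):
    toy_adj = sp.csr_matrix(np.array([[0., 1., 1., 0.],
                                      [1., 0., 1., 0.],
                                      [1., 1., 0., 1.],
                                      [0., 0., 1., 0.]]))
    # top-k singular triplets of the (float) adjacency matrix
    U, S, V = sp.linalg.svds(toy_adj.asfptype(), k=k)
    # dense rank-k approximation; note it is no longer binary, and GCNSVD
    # trains directly on this (possibly negative-valued) reconstruction
    return U @ np.diag(S) @ V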

class GCNJaccard(GCN):
    """GCNJaccard first preprocesses the input graph by dropping dissimilar
    edges and then trains a GCN on the processed graph. See more details in
    Adversarial Examples on Graph Data: Deep Insights into Attack and Defense,
    https://arxiv.org/pdf/1903.01610.pdf.

    Parameters
    ----------
    nfeat : int
        size of input feature dimension
    nhid : int
        number of hidden units
    nclass : int
        size of output dimension
    binary_feature : bool
        whether the node features are binary. If True, Jaccard similarity is
        used to compare connected nodes; otherwise cosine similarity is used.
    dropout : float
        dropout rate for GCN
    lr : float
        learning rate for GCN
    weight_decay : float
        weight decay coefficient (l2 normalization) for GCN.
        When `with_relu` is True, `weight_decay` will be set to 0.
    with_relu : bool
        whether to use relu activation function. If False, GCN will be linearized.
    with_bias : bool
        whether to include bias term in GCN weights.
    device : str
        'cpu' or 'cuda'.

    Examples
    --------
    We can first load the dataset and then train GCNJaccard.

    >>> from deeprobust.graph.data import PrePtbDataset, Dataset
    >>> from deeprobust.graph.defense import GCNJaccard
    >>> # load clean graph data
    >>> data = Dataset(root='/tmp/', name='cora', seed=15)
    >>> adj, features, labels = data.adj, data.features, data.labels
    >>> idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
    >>> # load perturbed graph data
    >>> perturbed_data = PrePtbDataset(root='/tmp/', name='cora')
    >>> perturbed_adj = perturbed_data.adj
    >>> # train defense model
    >>> model = GCNJaccard(nfeat=features.shape[1], nhid=16,
    ...                    nclass=labels.max().item() + 1,
    ...                    dropout=0.5, device='cpu').to('cpu')
    >>> model.fit(features, perturbed_adj, labels, idx_train, idx_val, threshold=0.03)
    """

    def __init__(self, nfeat, nhid, nclass, binary_feature=True, dropout=0.5,
                 lr=0.01, weight_decay=5e-4, with_relu=True, with_bias=True,
                 device='cpu'):

        super(GCNJaccard, self).__init__(nfeat, nhid, nclass, dropout, lr,
                                         weight_decay, with_relu, with_bias, device=device)
        self.device = device
        self.binary_feature = binary_feature

    def fit(self, features, adj, labels, idx_train, idx_val=None, threshold=0.01,
            train_iters=200, initialize=True, verbose=True, **kwargs):
        """First drop dissimilar edges with similarity smaller than the given
        threshold, and then train the GCN model on the processed graph. When
        idx_val is not None, pick the best model according to the validation
        loss.

        Parameters
        ----------
        features :
            node features. The format can be numpy.array or scipy matrix
        adj :
            the adjacency matrix.
        labels :
            node labels
        idx_train :
            node training indices
        idx_val :
            node validation indices. If not given (None), the GCN training
            process will not adopt early stopping
        threshold : float
            similarity threshold for dropping edges. If the similarity of two
            connected nodes is smaller than the threshold, the edge between
            them will be removed.
        train_iters : int
            number of training epochs
        initialize : bool
            whether to initialize parameters before training
        verbose : bool
            whether to show verbose logs
        """

        self.threshold = threshold
        modified_adj = self.drop_dissimilar_edges(features, adj)
        # modified_adj_tensor = utils.sparse_mx_to_torch_sparse_tensor(self.modified_adj)
        features, modified_adj, labels = utils.to_tensor(features, modified_adj,
                                                         labels, device=self.device)

        self.modified_adj = modified_adj
        self.features = features
        self.labels = labels
        super().fit(features, modified_adj, labels, idx_train, idx_val,
                    train_iters=train_iters, initialize=initialize, verbose=verbose)

    def drop_dissimilar_edges(self, features, adj, metric='similarity'):
        """Drop dissimilar edges. (Faster version using numba.) A toy sketch
        of the similarity measures involved (`_demo_edge_similarity`) follows
        the class definition.
        """
        if not sp.issparse(adj):
            adj = sp.csr_matrix(adj)

        adj_triu = sp.triu(adj, format='csr')

        if sp.issparse(features):
            features = features.todense().A  # make it easier for njit processing

        if metric == 'distance':
            removed_cnt = dropedge_dis(adj_triu.data, adj_triu.indptr,
                                       adj_triu.indices, features,
                                       threshold=self.threshold)
        else:
            if self.binary_feature:
                removed_cnt = dropedge_jaccard(adj_triu.data, adj_triu.indptr,
                                               adj_triu.indices, features,
                                               threshold=self.threshold)
            else:
                removed_cnt = dropedge_cosine(adj_triu.data, adj_triu.indptr,
                                              adj_triu.indices, features,
                                              threshold=self.threshold)
        print('removed %s edges in the original graph' % removed_cnt)
        modified_adj = adj_triu + adj_triu.transpose()
        return modified_adj

    def predict(self, features=None, adj=None):
        """By default, the inputs should be the unnormalized adjacency matrix.

        Parameters
        ----------
        features :
            node features. If `features` and `adj` are not given, this function
            will use the previously stored `features` and `adj` from training
            to make predictions.
        adj :
            adjacency matrix. If `features` and `adj` are not given, this function
            will use the previously stored `features` and `adj` from training
            to make predictions.

        Returns
        -------
        torch.FloatTensor
            output (log probabilities) of GCNJaccard
        """

        self.eval()
        if features is None and adj is None:
            return self.forward(self.features, self.adj_norm)
        else:
            adj = self.drop_dissimilar_edges(features, adj)
            if type(adj) is not torch.Tensor:
                features, adj = utils.to_tensor(features, adj, device=self.device)

            self.features = features
            if utils.is_sparse_tensor(adj):
                self.adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
            else:
                self.adj_norm = utils.normalize_adj_tensor(adj)
            return self.forward(self.features, self.adj_norm)

    def _drop_dissimilar_edges(self, features, adj):
        """Drop dissimilar edges. (Slower version.)
        """
        if not sp.issparse(adj):
            adj = sp.csr_matrix(adj)
        modified_adj = adj.copy().tolil()

        # preprocessing based on features
        print('=== GCN-Jaccard ===')
        edges = np.array(modified_adj.nonzero()).T
        removed_cnt = 0
        for edge in tqdm(edges):
            n1 = edge[0]
            n2 = edge[1]
            if n1 > n2:
                continue

            if self.binary_feature:
                J = self._jaccard_similarity(features[n1], features[n2])

                if J < self.threshold:
                    modified_adj[n1, n2] = 0
                    modified_adj[n2, n1] = 0
                    removed_cnt += 1
            else:
                # for non-binary features, use cosine similarity
                C = self._cosine_similarity(features[n1], features[n2])

                if C < self.threshold:
                    modified_adj[n1, n2] = 0
                    modified_adj[n2, n1] = 0
                    removed_cnt += 1
        print('removed %s edges in the original graph' % removed_cnt)
        return modified_adj

    def _jaccard_similarity(self, a, b):
        intersection = a.multiply(b).count_nonzero()
        J = intersection * 1.0 / (a.count_nonzero() + b.count_nonzero() - intersection)
        return J

    def _cosine_similarity(self, a, b):
        inner_product = (a * b).sum()
        C = inner_product / (np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-10)
        return C
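
# --- Illustrative sketch (not part of the original module) ----------------
# A hedged toy example of the two similarity measures GCNJaccard uses to
# decide whether an edge survives preprocessing: Jaccard similarity for
# binary features, cosine similarity otherwise. The function name and the
# feature vectors below are assumptions made purely for illustration.
def _demo_edge_similarity():
    a = np.array([1., 0., 1., 1., 0.])
    b = np.array([1., 1., 0., 1., 0.])
    # Jaccard: |a AND b| / |a OR b| = 2 / 4 = 0.5 for these vectors
    intersection = np.count_nonzero(a * b)
    jaccard = intersection / (np.count_nonzero(a) + np.count_nonzero(b) - intersection)
    # cosine: <a, b> / (||a|| * ||b||) = 2 / 3 here
    cosine = (a * b).sum() / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)
    # with the fit() default threshold=0.01, this edge would be kept
    return jaccard, cosine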

def __dropedge_jaccard(A, iA, jA, features, threshold):
    # deprecated: for sparse feature matrix...
    removed_cnt = 0
    for row in range(len(iA)-1):
        for i in range(iA[row], iA[row+1]):
            # print(row, jA[i], A[i])
            n1 = row
            n2 = jA[i]
            a, b = features[n1], features[n2]
            intersection = a.multiply(b).count_nonzero()
            J = intersection * 1.0 / (a.count_nonzero() + b.count_nonzero() - intersection)

            if J < threshold:
                A[i] = 0
                # A[n2, n1] = 0
                removed_cnt += 1
    return removed_cnt

# The numba kernels below operate on the upper-triangular CSR arrays
# (data, indptr, indices); see _demo_dropedge_jaccard at the end of the
# file for a toy run.

@njit
def dropedge_jaccard(A, iA, jA, features, threshold):
    removed_cnt = 0
    for row in range(len(iA)-1):
        for i in range(iA[row], iA[row+1]):
            # print(row, jA[i], A[i])
            n1 = row
            n2 = jA[i]
            a, b = features[n1], features[n2]
            intersection = np.count_nonzero(a * b)
            J = intersection * 1.0 / (np.count_nonzero(a) + np.count_nonzero(b) - intersection)

            if J < threshold:
                A[i] = 0
                # A[n2, n1] = 0
                removed_cnt += 1
    return removed_cnt

@njit
def dropedge_cosine(A, iA, jA, features, threshold):
    removed_cnt = 0
    for row in range(len(iA)-1):
        for i in range(iA[row], iA[row+1]):
            # print(row, jA[i], A[i])
            n1 = row
            n2 = jA[i]
            a, b = features[n1], features[n2]
            inner_product = (a * b).sum()
            C = inner_product / (np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-8)

            if C < threshold:
                A[i] = 0
                # A[n2, n1] = 0
                removed_cnt += 1
    return removed_cnt

@njit
def dropedge_dis(A, iA, jA, features, threshold):
    removed_cnt = 0
    for row in range(len(iA)-1):
        for i in range(iA[row], iA[row+1]):
            # print(row, jA[i], A[i])
            n1 = row
            n2 = jA[i]
            C = np.linalg.norm(features[n1] - features[n2])
            if C > threshold:
                A[i] = 0
                # A[n2, n1] = 0
                removed_cnt += 1

    return removed_cnt

@njit
def dropedge_both(A, iA, jA, features, threshold1=2.5, threshold2=0.01):
    removed_cnt = 0
    for row in range(len(iA)-1):
        for i in range(iA[row], iA[row+1]):
            # print(row, jA[i], A[i])
            n1 = row
            n2 = jA[i]
            C1 = np.linalg.norm(features[n1] - features[n2])

            a, b = features[n1], features[n2]
            inner_product = (a * b).sum()
            # cosine similarity; the denominator should be the product of the
            # two norms (the original summed the squared norms under one sqrt)
            C2 = inner_product / (np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-6)
            # drop the edge if the distance is too large or the cosine
            # similarity too small (the original compared `threshold2 < 0`,
            # which never fires for a positive threshold)
            if C1 > threshold1 or C2 < threshold2:
                A[i] = 0
                # A[n2, n1] = 0
                removed_cnt += 1

    return removed_cnt


if __name__ == "__main__":
    from deeprobust.graph.data import PrePtbDataset, Dataset
    # load clean graph data
    dataset_str = 'pubmed'
    data = Dataset(root='/tmp/', name=dataset_str, seed=15)
    adj, features, labels = data.adj, data.features, data.labels
    idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
    # load perturbed graph data
    perturbed_data = PrePtbDataset(root='/tmp/', name=dataset_str)
    perturbed_adj = perturbed_data.adj

    # train defense model
    print("Test GCNJaccard")
    model = GCNJaccard(nfeat=features.shape[1], nhid=16,
                       nclass=labels.max().item() + 1, binary_feature=False,
                       dropout=0.5, device='cuda').to('cuda')
    model.fit(features, perturbed_adj, labels, idx_train, idx_val, threshold=0.1)
    model.test(idx_test)
    prediction_1 = model.predict()
    prediction_2 = model.predict(features, perturbed_adj)
    assert (prediction_1 != prediction_2).sum() == 0

    print("Test GCNSVD")
    model = GCNSVD(nfeat=features.shape[1], nhid=16,
                   nclass=labels.max().item() + 1, dropout=0.5,
                   device='cuda').to('cuda')
    model.fit(features, perturbed_adj, labels, idx_train, idx_val, k=20)
    model.test(idx_test)
    prediction_1 = model.predict()
    prediction_2 = model.predict(features, perturbed_adj)
    # compare with an absolute tolerance; a signed mean could pass trivially
    assert (prediction_1 - prediction_2).abs().mean() < 1e-5
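
# --- Illustrative sketch (not part of the original module) ----------------
# A hedged end-to-end toy run of the numba kernel path used by
# `GCNJaccard.drop_dissimilar_edges`: build the upper-triangular CSR half of
# a graph, let `dropedge_jaccard` zero out dissimilar edges in place, then
# symmetrize. The function name and toy data are assumptions for
# illustration only.
def _demo_dropedge_jaccard(threshold=0.1):
    features = np.array([[1., 0., 1.],
                         [1., 0., 1.],
                         [0., 1., 0.]])
    adj = sp.csr_matrix(np.array([[0., 1., 1.],
                                  [1., 0., 1.],
                                  [1., 1., 0.]]))
    adj_triu = sp.triu(adj, format='csr')
    # the kernel mutates adj_triu.data in place and returns the drop count;
    # here the two edges touching node 2 fall below the threshold
    removed = dropedge_jaccard(adj_triu.data, adj_triu.indptr,
                               adj_triu.indices, features, threshold=threshold)
    adj_triu.eliminate_zeros()  # optional tidying of the explicit zeros
    return removed, adj_triu + adj_triu.transpose()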