Source code for deeprobust.image.attack.cw

import torch
from torch import optim
import torch.nn as nn
import numpy as np
import logging

from deeprobust.image.attack.base_attack import BaseAttack
from deeprobust.image.utils import onehot_like
from deeprobust.image.optimizer import AdamOptimizer

class CarliniWagner(BaseAttack):
    """
    C&W attack is an effective method to calculate high-confidence adversarial examples.

    References
    ----------
    .. [1] Carlini, N., & Wagner, D. (2017, May). Towards evaluating the
           robustness of neural networks. https://arxiv.org/pdf/1608.04644.pdf

    This reimplementation is based on https://github.com/kkew3/pytorch-cw2
    Copyright 2018 Kaiwen Wu

    Examples
    --------
    >>> from deeprobust.image.attack.cw import CarliniWagner
    >>> from deeprobust.image.netmodels.CNN import Net
    >>> from deeprobust.image.config import attack_params

    >>> model = Net()
    >>> model.load_state_dict(torch.load("./trained_models/MNIST_CNN_epoch_20.pt", map_location = torch.device('cuda')))
    >>> model.eval()

    >>> x, y = datasets.MNIST()

    >>> attack = CarliniWagner(model, device='cuda')
    >>> AdvExArray = attack.generate(x, y, target_label = 1, classnum = 10, **attack_params['CW_MNIST'])
    """

    def __init__(self, model, device='cuda'):
        super(CarliniWagner, self).__init__(model, device)
        self.model = model
        self.device = device

    def generate(self, image, label, target_label, **kwargs):
        """
        Call this function to generate adversarial examples.

        Parameters
        ----------
        image : original image
        label : original (true) label
        target_label : label the adversarial example should be classified as
        kwargs : user defined parameters
        """
        assert self.check_type_device(image, label)
        assert self.parse_params(**kwargs)
        self.target = target_label

        return self.cw(self.model,
                       self.image,
                       self.label,
                       self.target,
                       self.confidence,
                       self.clip_max,
                       self.clip_min,
                       self.max_iterations,
                       self.initial_const,
                       self.binary_search_steps,
                       self.learning_rate)

    def parse_params(self,
                     classnum=10,
                     confidence=1e-4,
                     clip_max=1,
                     clip_min=0,
                     max_iterations=1000,
                     initial_const=1e-2,
                     binary_search_steps=5,
                     learning_rate=0.00001,
                     abort_early=True):
        """
        Parse the user defined parameters.

        Parameters
        ----------
        classnum : number of classes
        confidence : confidence margin added to the adversarial loss
        clip_max : maximum pixel value
        clip_min : minimum pixel value
        max_iterations : maximum number of optimization iterations per binary search step
        initial_const : initial trade-off constant c for the binary search
        binary_search_steps : number of binary search steps
        learning_rate : learning rate
        abort_early : set abort_early = True to allow early stopping
        """
        self.classnum = classnum
        self.confidence = confidence
        self.clip_max = clip_max
        self.clip_min = clip_min
        self.max_iterations = max_iterations
        self.initial_const = initial_const
        self.binary_search_steps = binary_search_steps
        self.learning_rate = learning_rate
        self.abort_early = abort_early
        return True

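    # Illustrative parameter override (the values here are examples, not the
    # shipped attack_params['CW_MNIST'] config): every knob documented in
    # parse_params can also be passed directly through generate(), e.g.
    #
    #     attack = CarliniWagner(model, device='cuda')
    #     adv = attack.generate(x, y, target_label=1, classnum=10,
    #                           confidence=1e-4, max_iterations=1000,
    #                           binary_search_steps=5, learning_rate=1e-5)
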
    def cw(self, model, image, label, target, confidence, clip_max, clip_min,
           max_iterations, initial_const, binary_search_steps, learning_rate):

        # map the input image into the unconstrained tanh (attack) space
        img_tanh = self.to_attack_space(image.cpu())
        img_ori, _ = self.to_model_space(img_tanh)
        img_ori = img_ori.to(self.device)

        # binary search initialization for the trade-off constant c
        c = initial_const
        c_low = 0
        c_high = np.inf
        found_adv = False
        last_loss = np.inf

        for step in range(binary_search_steps):
            # initialize w: the perturbed image in tanh space
            w = torch.from_numpy(img_tanh.numpy())
            optimizer = AdamOptimizer(img_tanh.shape)
            is_adversarial = False

            for iteration in range(max_iterations):
                # map w back to model space to get the adversarial candidate
                img_adv, adv_grad = self.to_model_space(w)
                img_adv = img_adv.to(self.device)
                img_adv.requires_grad = True

                # remember whether any iterate for this c was adversarial
                if self.pending_f(img_adv):
                    is_adversarial = True

                # loss and gradient of the loss w.r.t. the image x
                loss, loss_grad = self.loss_function(
                        img_adv, c, self.target, img_ori,
                        self.confidence, self.clip_min, self.clip_max)

                # chain rule: d loss / d w = (d x / d w) * (d loss / d x)
                gradient = adv_grad.to(self.device) * loss_grad.to(self.device)
                w = w + torch.from_numpy(
                        optimizer(gradient.cpu().detach().numpy(), learning_rate)).float()

            if is_adversarial:
                found_adv = True

            # binary search on c: shrink it after a success (to reduce
            # distortion), grow it after a failure
            if is_adversarial:
                c_high = c
            else:
                c_low = c

            if c_high == np.inf:
                c *= 10
            else:
                c = (c_high + c_low) / 2

            if step % 10 == 0:
                print("binary search step: {}, loss: {:.4f}".format(step, loss))

            # abort early once the loss stops decreasing (only reachable
            # when binary_search_steps > 100)
            if self.abort_early and (step % 10) == 0 and step > 100:
                print("early abortion?", loss, last_loss)
                if not (loss <= 0.9999 * last_loss):
                    break
                last_loss = loss

        if not found_adv:
            logging.warning("C&W attack failed to find an adversarial example.")

        return img_adv.detach()

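    # Worked example of the schedule for c above (illustrative numbers): with
    # initial_const = 0.01 and no adversarial example found yet, c_high stays
    # at inf and c is multiplied by 10 each step: 0.01 -> 0.1 -> 1 -> 10. If
    # the attack first succeeds at c = 10, then c_high = 10 while c_low = 1,
    # and the next step tries the midpoint c = 5.5, homing in on the smallest
    # c that still succeeds (a smaller c weights the distortion term more).
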
    def loss_function(self, x_p, const, target, reconstructed_original,
                      confidence, min_, max_):
        """Returns the loss and the gradient of the loss w.r.t. x,
        assuming that logits = model(x)."""

        # get the output of the model before softmax
        x_p.requires_grad = True
        logits = self.model.get_logits(x_p).to(self.device)

        # find the largest logit among the non-target classes; the target
        # entry is masked with -inf (multiplying by a 0/1 mask would be
        # wrong when logits are negative)
        logits_np = logits[0].detach().cpu().numpy().copy()
        logits_np[target] = -np.inf
        secondlargest = np.argmax(logits_np)

        is_adv_loss = logits[0][secondlargest] - logits[0][target]

        # is_adv is True as soon as is_adv_loss goes below 0,
        # but sometimes we want additional confidence
        is_adv_loss += confidence

        # hinge term: the gradient is zero once the objective is satisfied
        if is_adv_loss <= 0:
            is_adv_loss = 0
            is_adv_loss_grad = 0
        else:
            is_adv_loss.backward()
            is_adv_loss_grad = x_p.grad

        s = max_ - min_
        squared_l2_distance = np.sum(
            ((x_p - reconstructed_original) ** 2).cpu().detach().numpy()
        ) / s ** 2
        total_loss = squared_l2_distance + const * is_adv_loss

        squared_l2_distance_grad = (2 / s ** 2) * (x_p - reconstructed_original)
        total_loss_grad = squared_l2_distance_grad + const * is_adv_loss_grad

        return total_loss, total_loss_grad

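    # The quantity minimized above is the targeted C&W L2 objective from [1]:
    #
    #     L(x') = ||x' - x||_2^2 / s^2
    #             + c * max(0, max_{i != t} Z(x')_i - Z(x')_t + confidence)
    #
    # where Z(.) are the pre-softmax logits, t is the target class, and
    # s = clip_max - clip_min normalizes the squared L2 distance.
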
    def pending_f(self, x_p):
        """Returns True once the attack objective is met, i.e. the target
        logit exceeds every other logit by at least `confidence`."""
        logits = self.model.get_logits(x_p)[0].detach().cpu().numpy().copy()
        Zx_t = logits[self.target]

        # exclude the target class via -inf masking; multiplying by a 0/1
        # onehot mask would be wrong when logits are negative
        logits[self.target] = -np.inf
        Zx_i = np.max(logits)

        return Zx_i - Zx_t < -self.confidence

    def to_attack_space(self, x):
        """Transforms an input from the model space to the attack (tanh) space."""
        x = x.detach()

        # map from [clip_min, clip_max] to [-1, +1]:
        # x' = (x - 0.5 * (max + min)) / (0.5 * (max - min))
        a = (self.clip_min + self.clip_max) / 2
        b = (self.clip_max - self.clip_min) / 2
        x = (x - a) / b

        # from [-1, +1] to approx. (-1, +1), so that arctanh stays finite
        x = x * 0.999999

        # from (-1, +1) to (-inf, +inf)
        return np.arctanh(x)

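    # The tanh change of variables is what keeps the optimization
    # unconstrained: any w in (-inf, +inf) maps back to a valid image in
    # (clip_min, clip_max), so no clipping is needed during gradient steps.
    # By the chain rule, d x / d w = b * (1 - tanh(w)^2), which is exactly the
    # `grad` factor returned by to_model_space below and multiplied into the
    # loss gradient in cw().
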
    def to_model_space(self, x):
        """Transforms an input from the attack space to the model space.
        This transformation and the returned gradient are elementwise."""

        # from (-inf, +inf) to (-1, +1)
        x = np.tanh(x)
        grad = 1 - np.square(x)

        # map from (-1, +1) to (min_, max_)
        a = (self.clip_min + self.clip_max) / 2
        b = (self.clip_max - self.clip_min) / 2
        x = x * b + a
        grad = grad * b

        return x, grad
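
# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (not part of the original module). It assumes the
# deeprobust convention that models expose get_logits(x) returning pre-softmax
# scores, and that BaseAttack.check_type_device stores the inputs as
# self.image / self.label (as generate() above relies on). The toy model and
# random data below are placeholders, not attack-quality settings.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _ToyNet(nn.Module):
        def __init__(self):
            super(_ToyNet, self).__init__()
            self.fc = nn.Linear(28 * 28, 10)

        def forward(self, x):
            return self.fc(x.view(x.size(0), -1))

        def get_logits(self, x):
            # pre-softmax output, as CarliniWagner expects
            return self.forward(x)

    model = _ToyNet().eval()
    x = torch.rand(1, 1, 28, 28)   # fake image with pixels in [0, 1]
    y = torch.tensor([3])          # fake true label

    attack = CarliniWagner(model, device='cpu')
    adv = attack.generate(x, y, target_label=1, classnum=10,
                          max_iterations=20, binary_search_steps=2)
    print(adv.shape)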