import torch
from torch import optim
import torch.nn as nn
import numpy as np
import logging
from deeprobust.image.attack.base_attack import BaseAttack
from deeprobust.image.utils import onehot_like
from deeprobust.image.optimizer import AdamOptimizer
class CarliniWagner(BaseAttack):
    """
    C&W attack is an effective method to calculate high-confidence adversarial examples.

    The perturbation is optimised in tanh space so that the adversarial image
    always stays inside ``[clip_min, clip_max]``. The trade-off constant ``c``
    between the L2 distance and the misclassification term is tuned by binary
    search. Only the first sample of the input batch is attacked: the loss and
    the success test both read ``logits[0]``.

    References
    ----------
    .. [1] Carlini, N., & Wagner, D. (2017, May). Towards evaluating the robustness of neural networks. https://arxiv.org/pdf/1608.04644.pdf

    This reimplementation is based on https://github.com/kkew3/pytorch-cw2
    Copyright 2018 Kaiwen Wu

    Examples
    --------

    >>> from deeprobust.image.attack.cw import CarliniWagner
    >>> from deeprobust.image.netmodels.CNN import Net
    >>> from deeprobust.image.config import attack_params

    >>> model = Net()
    >>> model.load_state_dict(torch.load("./trained_models/MNIST_CNN_epoch_20.pt", map_location = torch.device('cuda')))
    >>> model.eval()

    >>> x,y = datasets.MNIST()
    >>> attack = CarliniWagner(model, device='cuda')
    >>> AdvExArray = attack.generate(x, y, target_label = 1, classnum = 10, **attack_params['CW_MNIST'])
    """

    def __init__(self, model, device='cuda'):
        """
        Parameters
        ----------
        model :
            victim model; it must expose ``get_logits(x)``
        device :
            device on which the model is evaluated
        """
        super(CarliniWagner, self).__init__(model, device)
        self.model = model
        self.device = device

    def generate(self, image, label, target_label, **kwargs):
        """
        Call this function to generate adversarial examples.

        Parameters
        ----------
        image :
            original image
        label :
            label passed to ``check_type_device`` together with the image
            (presumably the true label -- confirm against BaseAttack)
        target_label :
            class the adversarial example should be classified as
        kwargs :
            user defined parameters, see :meth:`parse_params`

        Returns
        -------
        torch.Tensor
            the adversarial image produced by :meth:`cw`

        Raises
        ------
        AssertionError
            if ``check_type_device`` or ``parse_params`` returns a falsy value
        """
        # These helpers are called for their side effects (parse_params stores
        # the attack settings on self; check_type_device presumably stores
        # self.image / self.label -- it is defined in BaseAttack). They must
        # not live inside an ``assert``: ``python -O`` strips asserts and the
        # calls would silently never run. AssertionError is kept so existing
        # callers see the same exception type.
        if not self.check_type_device(image, label):
            raise AssertionError("check_type_device rejected the image/label pair")
        if not self.parse_params(**kwargs):
            raise AssertionError("parse_params rejected the attack parameters")

        self.target = target_label
        return self.cw(self.model,
                       self.image,
                       self.label,
                       self.target,
                       self.confidence,
                       self.clip_max,
                       self.clip_min,
                       self.max_iterations,
                       self.initial_const,
                       self.binary_search_steps,
                       self.learning_rate
                       )

    def parse_params(self,
                     classnum = 10,
                     confidence = 1e-4,
                     clip_max = 1,
                     clip_min = 0,
                     max_iterations = 1000,
                     initial_const = 1e-2,
                     binary_search_steps = 5,
                     learning_rate = 0.00001,
                     abort_early = True):
        """
        Parse the user defined parameters.

        Parameters
        ----------
        classnum :
            number of class
        confidence :
            confidence
        clip_max :
            maximum pixel value
        clip_min :
            minimum pixel value
        max_iterations :
            maximum number of iterations
        initial_const :
            initialization of binary search
        binary_search_steps :
            step number of binary search
        learning_rate :
            learning rate
        abort_early :
            Set abort_early = True to allow early stop

        Returns
        -------
        bool
            always ``True`` once the parameters are stored

        Raises
        ------
        ValueError
            if ``clip_max`` is not strictly greater than ``clip_min``
        """
        # The tanh re-parameterisation divides by (clip_max - clip_min) / 2,
        # so an empty or inverted pixel range cannot be mapped.
        if clip_max <= clip_min:
            raise ValueError(
                "clip_max ({}) must be greater than clip_min ({})".format(clip_max, clip_min)
            )

        self.classnum = classnum
        self.confidence = confidence
        self.clip_max = clip_max
        self.clip_min = clip_min
        self.max_iterations = max_iterations
        self.initial_const = initial_const
        self.binary_search_steps = binary_search_steps
        self.learning_rate = learning_rate
        self.abort_early = abort_early
        return True

    def cw(self, model, image, label, target, confidence, clip_max, clip_min, max_iterations, initial_const, binary_search_steps, learning_rate):
        """
        Run the C&W optimisation with a binary search over the constant ``c``.

        Parameters
        ----------
        model, label, target, confidence, clip_max, clip_min :
            accepted for interface compatibility; the computation reads the
            corresponding attributes stored on ``self`` instead
        image :
            original image tensor (moved to CPU for the tanh-space update)
        max_iterations :
            number of optimisation steps per binary search step
        initial_const :
            first value of the trade-off constant ``c``
        binary_search_steps :
            number of values of ``c`` that are tried
        learning_rate :
            learning rate handed to the Adam optimizer

        Returns
        -------
        torch.Tensor
            the adversarial example closest (squared L2) to the original among
            all successful iterates; if the attack never succeeded, the last
            iterate; if no iteration ran at all, the reconstructed original
        """
        # change the input image: w lives in tanh space, img_ori is the
        # original mapped back to model space (slightly shrunk from the bounds)
        img_tanh = self.to_attack_space(image.cpu())
        img_ori, _ = self.to_model_space(img_tanh)
        img_ori = img_ori.to(self.device)

        # binary search initialization
        c = initial_const
        c_low = 0
        c_high = np.inf
        last_loss = np.inf

        # Defined up front so that zero iterations / zero search steps still
        # yield a printable loss and a returnable image.
        loss = np.inf
        img_adv = img_ori
        best_adv = None
        best_dist = np.inf

        for step in range(binary_search_steps):
            # initialize w : perturbed image in tanh space
            w = img_tanh.clone()
            optimizer = AdamOptimizer(img_tanh.shape)

            # Must be reset for every constant: the binary search needs to
            # know whether *this* value of c succeeded, not any earlier one.
            found_adv = False

            for _ in range(max_iterations):
                # adversarial example in model space and d(img_adv)/d(w)
                img_adv, adv_grid = self.to_model_space(w)
                img_adv = img_adv.to(self.device)
                img_adv.requires_grad = True

                # pending success
                is_adversarial = self.pending_f(img_adv)

                # calculate loss function and gradient of loss function on x
                loss, loss_grad = self.loss_function(
                    img_adv, c, self.target, img_ori, self.confidence, self.clip_min, self.clip_max
                )

                # chain rule: gradient of the loss function on w
                gradient = adv_grid.to(self.device) * loss_grad.to(self.device)
                w = w + torch.from_numpy(optimizer(gradient.cpu().detach().numpy(), learning_rate)).float()

                if is_adversarial:
                    found_adv = True
                    # remember the least perturbed successful example
                    dist = torch.sum((img_adv.detach() - img_ori) ** 2).item()
                    if dist < best_dist:
                        best_dist = dist
                        best_adv = img_adv.detach().clone()

            # do binary search on c
            if found_adv:
                c_high = c
            else:
                c_low = c

            if c_high == np.inf:
                c *= 10
            else:
                c = (c_high + c_low) / 2

            if step % 10 == 0:
                print("iteration:{:.0f},loss:{:.4f}".format(step, loss))

            # abort early
            # NOTE(review): this check is keyed on the binary-search step and
            # requires step > 100, so it never fires with the default of 5
            # search steps. It is left as is because moving it into the inner
            # loop would change the attack's behaviour -- confirm the intent.
            if self.abort_early and (step % 10) == 0 and step > 100:
                print("early abortion?", loss, last_loss)
                if not (loss <= 0.9999 * last_loss):
                    break
                last_loss = loss

        if best_adv is not None:
            return best_adv
        return img_adv.detach()

    def loss_function(
        self, x_p, const, target, reconstructed_original, confidence, min_, max_):
        """Returns the loss and the gradient of the loss w.r.t. x,
        assuming that logits = model(x).

        Parameters
        ----------
        x_p :
            candidate adversarial image; must be a leaf tensor because
            ``requires_grad`` is switched on here
        const :
            weight of the misclassification term
        target :
            target class index (anything ``int()`` accepts)
        reconstructed_original :
            original image in model space
        confidence :
            margin by which the target logit must exceed every other logit
        min_, max_ :
            pixel bounds, used to normalise the L2 distance

        Returns
        -------
        total_loss :
            NumPy scalar ``squared_l2 / s**2 + const * max(0, hinge)``
        total_loss_grad : torch.Tensor
            gradient of ``total_loss`` w.r.t. ``x_p`` (detached)
        """
        ## get the output of model before softmax
        x_p.requires_grad = True
        logits = self.model.get_logits(x_p).to(self.device)
        target = int(target)

        ## find the largest class except the target class
        # Exclude the target with -inf. Multiplying by a 0/1 mask would turn
        # the target slot into 0, which wins the argmax whenever every other
        # logit is negative.
        masked = logits[0].detach().clone()
        masked[target] = -float("inf")
        secondlargest = int(torch.argmax(masked))

        # is_adv is True as soon as the is_adv_loss goes below 0
        # but sometimes we want additional confidence
        is_adv_loss = logits[0][secondlargest] - logits[0][target] + confidence

        if is_adv_loss.item() <= 0:
            # The hinge max(0, .) is inactive here, so both its value and its
            # gradient are zero.
            is_adv_loss_value = 0
            is_adv_loss_grad = 0
        else:
            is_adv_loss_value = is_adv_loss.item()
            # autograd.grad leaves x_p.grad untouched, so repeated calls on
            # the same tensor cannot accumulate stale gradients.
            (is_adv_loss_grad,) = torch.autograd.grad(is_adv_loss, x_p)

        s = max_ - min_
        diff = (x_p - reconstructed_original).detach()
        squared_l2_distance = np.sum((diff ** 2).cpu().numpy()) / s ** 2
        total_loss = squared_l2_distance + const * is_adv_loss_value

        squared_l2_distance_grad = (2 / s ** 2) * diff
        total_loss_grad = squared_l2_distance_grad + const * is_adv_loss_grad
        return total_loss, total_loss_grad

    def pending_f(self, x_p):
        """Return True when the attack currently succeeds, i.e. when the
        target logit of the first sample exceeds every other logit by more
        than ``self.confidence``.

        Parameters
        ----------
        x_p :
            candidate adversarial image

        Returns
        -------
        bool
        """
        target = int(self.target)

        # A single forward pass is enough; no gradient is needed here.
        with torch.no_grad():
            logits = self.model.get_logits(x_p).double().to(self.device)[0]

        # Read the target logit directly and mask it with -inf for the
        # "largest other" search. A 0/1 multiplicative mask would report 0
        # instead of the true value whenever the logits involved are negative.
        Zx_t = logits[target].item()
        others = logits.clone()
        others[target] = -float("inf")
        Zx_i = others.max().item()

        return bool(Zx_i - Zx_t < -self.confidence)

    def to_attack_space(self, x):
        """Transforms an input from the model space ``[clip_min, clip_max]``
        to the unbounded attack (tanh) space.

        Parameters
        ----------
        x :
            torch tensor (detached before use) or array-like

        Returns
        -------
        same kind as the input (torch.Tensor or numpy.ndarray)
        """
        # map from [min_, max_] to [-1, +1]
        # x' = (x - 0.5 * (max + min)) / (0.5 * (max - min))
        a = (self.clip_min + self.clip_max) / 2
        b = (self.clip_max - self.clip_min) / 2

        if torch.is_tensor(x):
            # from [-1, +1] to approx. (-1, +1) so that arctanh stays finite
            x = (x.detach() - a) / b * 0.999999
            # from (-1, +1) to (-inf, +inf): arctanh written with log1p, which
            # keeps the computation in torch instead of relying on NumPy
            # ufuncs being applied to tensors
            return 0.5 * (torch.log1p(x) - torch.log1p(-x))

        x = (np.asarray(x) - a) / b * 0.999999
        return np.arctanh(x)

    def to_model_space(self, x):
        """Transforms an input from the attack space
        to the model space. This transformation and
        the returned gradient are elementwise.

        Parameters
        ----------
        x :
            torch tensor or array-like in attack (tanh) space

        Returns
        -------
        x :
            the input mapped into ``(clip_min, clip_max)``
        grad :
            elementwise derivative of the mapped value w.r.t. the input
        """
        # from (-inf, +inf) to (-1, +1)
        x = torch.tanh(x) if torch.is_tensor(x) else np.tanh(x)
        grad = 1 - x ** 2

        # map from (-1, +1) to (min_, max_)
        a = (self.clip_min + self.clip_max) / 2
        b = (self.clip_max - self.clip_min) / 2
        x = x * b + a
        grad = grad * b
        return x, grad