# Imports
import logging
import math
import random
from tqdm import tqdm
# Torch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
# Custom package imports
from .decoders import DecoderAttention, DecoderEvent
from .embedding import EmbeddingOneHot
from .encoders import Encoder
from .loss import LabelSmoothing
from .utils import unique_2d
# Set logger
logger = logging.getLogger(__name__)
class ContextBuilder(nn.Module):

    def __init__(self, input_size, output_size, hidden_size=128, num_layers=1,
                 max_length=10, bidirectional=False, LSTM=False):
        """ContextBuilder that learns to interpret context from security events.
        Based on an attention-based Encoder-Decoder architecture.

        Parameters
        ----------
        input_size : int
            Size of input vocabulary, i.e. possible distinct input items

        output_size : int
            Size of output vocabulary, i.e. possible distinct output items

        hidden_size : int, default=128
            Size of hidden layer in sequence to sequence prediction.
            This parameter determines the complexity of the model and its
            prediction power. However, high values will result in slower
            training and prediction times

        num_layers : int, default=1
            Number of recurrent layers to use

        max_length : int, default=10
            Maximum length of input sequence to expect

        bidirectional : boolean, default=False
            If True, use a bidirectional encoder and decoder

        LSTM : boolean, default=False
            If True, use an LSTM as a recurrent unit instead of GRU
        """
        logger.info("ContextBuilder.__init__")
        super().__init__()

        # Two embeddings are kept: a dense one (fed to the attention decoder)
        # and a one-hot one (fed to the encoder).
        # NOTE: sub-modules are created in the same order as before so that
        # parameter registration and random weight initialisation match.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.embedding_one_hot = EmbeddingOneHot(input_size)

        # Encoder turns an input sequence into per-step encodings plus a
        # context vector summarising the sequence.
        self.encoder = Encoder(
            embedding=self.embedding_one_hot,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            LSTM=LSTM,
        )

        # Attention decoder produces an attention vector over the input
        # sequence for every predicted step.
        self.decoder_attention = DecoderAttention(
            embedding=self.embedding,
            context_size=hidden_size,
            attention_size=max_length,
            num_layers=num_layers,
            dropout=0.1,
            bidirectional=bidirectional,
            LSTM=LSTM,
        )

        # Event decoder combines encoded input and attention into a
        # probability distribution over output events.
        self.decoder_event = DecoderEvent(
            input_size=input_size,
            output_size=output_size,
            dropout=0.1,
        )
########################################################################
# ContextBuilder Forward #
########################################################################
[docs] def forward(self, X, y=None, steps=1, teach_ratio=0.5):
"""Forwards data through ContextBuilder.
Parameters
----------
X : torch.Tensor of shape=(n_samples, seq_len)
Tensor of input events to forward.
y : torch.Tensor of shape=(n_samples, steps), optional
If given, use value of y as next input with probability
teach_ratio.
steps : int, default=1
Number of steps to predict in the future.
teach_ratio : float, default=0.5
Ratio of sequences to train that use given labels Y.
The remaining part will be trained using the predicted values.
Returns
-------
confidence : torch.Tensor of shape=(n_samples, steps, output_size)
The confidence level of each output event.
attention : torch.Tensor of shape=(n_samples, steps, seq_len)
Attention corrsponding to X given as (batch, out_seq, in_seq).
"""
logger.info("forward {} samples".format(X.shape[0]))
####################################################################
# Perform check on events in X #
####################################################################
if X.max() >= self.embedding_one_hot.input_size:
raise ValueError(
"Expected {} different input events, but received input event "
"'{}' not in expected range 0-{}. Please ensure that the "
"ContextBuilder is configured with the correct input_size and "
"output_size".format(
self.embedding_one_hot.input_size,
X.max(),
self.embedding_one_hot.input_size-1,
))
####################################################################
# Forward data #
####################################################################
# Initialise results
confidence = list()
attention = list()
# Get initial inputs of decoder
decoder_input = torch.zeros(
size = (X.shape[0], 1),
dtype = torch.long,
device = X.device,
)
# Encode input
X_encoded, context_vector = self.encoder(X)
# Loop over all targets
for step in range(steps):
# Compute attention
attention_, context_vector = self.decoder_attention(
context_vector = context_vector,
previous_input = decoder_input,
)
# Compute event probability distribution
confidence_ = self.decoder_event(
X = X_encoded,
attention = attention_,
)
# Store confidence
confidence.append(confidence_)
# Store attention
attention .append(attention_)
# Detatch from history
if y is not None and random.random() <= teach_ratio:
decoder_input = y[:, step]
else:
decoder_input = confidence_.argmax(dim=1).detach().unsqueeze(1)
# Return result
return torch.stack(confidence, dim=1), torch.stack(attention, dim=1)
########################################################################
# Fit/predict methods #
########################################################################
[docs] def fit(self, X, y, epochs=10, batch_size=128, learning_rate=0.01,
optimizer=optim.SGD, teach_ratio=0.5, verbose=True):
"""Fit the sequence predictor with labelled data
Parameters
----------
X : array-like of type=int and shape=(n_samples, context_size)
Input context to train with.
y : array-like of type=int and shape=(n_samples, n_future_events)
Sequences of target events.
epochs : int, default=10
Number of epochs to train with.
batch_size : int, default=128
Batch size to use for training.
learning_rate : float, default=0.01
Learning rate to use for training.
optimizer : optim.Optimizer, default=torch.optim.SGD
Optimizer to use for training.
teach_ratio : float, default=0.5
Ratio of sequences to train including labels.
verbose : boolean, default=True
If True, prints progress.
Returns
-------
self : self
Returns self
"""
logger.info("fit {} samples".format(X.shape[0]))
# Get current mode
mode = self.training
# Get input as torch tensors
device = next(self.parameters()).device
X = torch.as_tensor(X, dtype=torch.int64, device=device)
y = torch.as_tensor(y, dtype=torch.int64, device=device)
# Set to training mode
self.train()
# Set criterion and optimiser
criterion = LabelSmoothing(self.decoder_event.out.out_features, 0.1)
optimizer = optimizer(
params = self.parameters(),
lr = learning_rate
)
# Load dataset
data = DataLoader(TensorDataset(X, y),
batch_size = batch_size,
shuffle = True,
)
# Loop over each epoch
for epoch in range(1, epochs+1):
try:
# Set progress bar if necessary
if verbose:
data = tqdm(data,
desc="[Epoch {:{width}}/{:{width}} loss={:.4f}]"
.format(epoch, epochs, 0, width=len(str(epochs)))
)
# Set average loss
total_loss = 0
total_items = 0
# Loop over entire dataset
for X_, y_ in data:
# Clear gradients
optimizer.zero_grad()
# Get prediction
confidence, _ = self.forward(X_, y_,
steps = y_.shape[1],
teach_ratio = teach_ratio
)
# Compute loss
loss = 0
for step in range(confidence.shape[1]):
loss += criterion(confidence[:, step], y_[:, step])
# Backpropagate
loss.backward()
optimizer.step()
# Update description
total_loss += loss.item() / X_.shape[1]
total_items += X_.shape[0]
if verbose:
data.set_description(
"[Epoch {:{width}}/{:{width}} loss={:.4f}]"
.format(epoch, epochs, total_loss/total_items,
width=len(str(epochs))))
except KeyboardInterrupt as e:
print("\nTraining interrupted, performing clean stop")
break
# Reset to original mode
self.train(mode)
# Return self
return self
[docs] def predict(self, X, y=None, steps=1):
"""Predict the next elements in sequence.
Parameters
----------
X : torch.Tensor
Tensor of input sequences
y : ignored
steps : int, default=1
Number of steps to predict into the future
Returns
-------
confidence : torch.Tensor of shape=(n_samples, seq_len, output_size)
The confidence level of each output
attention : torch.Tensor of shape=(n_samples, input_length)
Attention corrsponding to X given as (batch, out_seq, seq_len)
"""
logger.info("predict {} samples".format(X.shape[0]))
# Get current mode
mode = self.training
# Set to prediction mode
self.eval()
# Memory optimization, only use unique values
X, inverse = torch.unique(X, dim=0, return_inverse=True)
logger.info("predict {}/{} unique samples".format(X.shape[0], inverse.shape[0]))
# Do not perform gradient descent
with torch.no_grad():
# Perform all in single batch
confidence, attention = self.forward(X, steps=steps)
# Reset to original mode
self.train(mode)
# Return result
return confidence[inverse], attention[inverse]
[docs] def fit_predict(self, X, y, epochs=10, batch_size=128, learning_rate=0.01,
optimizer=optim.SGD, teach_ratio=0.5, verbose=True):
"""Fit the sequence predictor with labelled data
Parameters
----------
X : torch.Tensor
Tensor of input sequences
y : torch.Tensor
Tensor of output sequences
epochs : int, default=10
Number of epochs to train with
batch_size : int, default=128
Batch size to use for training
learning_rate : float, default=0.01
Learning rate to use for training
optimizer : optim.Optimizer, default=torch.optim.SGD
Optimizer to use for training
teach_ratio : float, default=0.5
Ratio of sequences to train including labels
verbose : boolean, default=True
If True, prints progress
Returns
-------
result : torch.Tensor
Predictions corresponding to X
"""
logger.info("fit_predict {} samples".format(X.shape[0]))
# Apply fit and predict in sequence
return self.fit(
X = X,
y = y,
epochs = epochs,
batch_size = batch_size,
learning_rate = learning_rate,
optimizer = optimizer,
teach_ratio = teach_ratio,
verbose = verbose,
).predict(X)
########################################################################
# ContextBuilder Query #
########################################################################
[docs] def query(self, X, y, iterations=0, batch_size=1024, ignore=None,
return_optimization=None, verbose=True):
"""Query the network to get optimal attention vector.
Parameters
----------
X : array-like of type=int and shape=(n_samples, context_size)
Input context of events, same as input to fit and predict
y : array-like of type=int and shape=(n_samples,)
Observed event
iterations : int, default=0
Number of iterations to perform for optimization of actual event
batch_size : int, default=1024
Batch size of items to optimize
ignore : int, optional
If given ignore this index as attention
return_optimization : float, optional
If given, returns number of items with confidence level larger
than given parameter. E.g. return_optimization=0.2 will also
return two boolean tensors for elements with a confidence >= 0.2
before optimization and after optimization.
verbose : boolean, default=True
If True, print progress
Returns
-------
confidence : torch.Tensor of shape=(n_samples, output_size)
Confidence of each prediction given new attention
attention : torch.Tensor of shape=(n_samples, context_size)
Importance of each input with respect to output
inverse : torch.Tensor of shape=(n_samples,)
Inverse is returned to reconstruct the original array
confidence_orig : torch.Tensor of shape=(n_samples,)
Only returned if return_optimization != None
Boolean array of items >= threshold before optimization
confidence_optim : torch.Tensor of shape=(n_samples,)
Only returned if return_optimization != None
Boolean array of items >= threshold after optimization
"""
# Get device
original_device = X.device
# Initialise result
result_confidence = list()
result_attention = list()
# Memory optimization, only use unique values
X, y, inverse = unique_2d(X, y)
# Ignore given datapoints
if ignore is not None:
raise NotImplementedError("Ignore is not properly implemented yet.")
attention[X == ignore] = 0
# Squeeze variables
y = y.squeeze(1)
# Initialise progress if necessary
if verbose:
progress = tqdm(None,
total = int(iterations)*int(math.ceil(X.shape[0]/batch_size)),
desc = "Optimizing query",
)
# Batch data
batches = DataLoader(
TensorDataset(X, y),
batch_size = batch_size,
shuffle = False,
)
# Count datapoints with confidence >= 0.2
if return_optimization is not None:
confidence_orig = list()
confidence_optim = list()
################################################################
# Attention optimisation #
################################################################
# Loop over batches
for batch, (X_, y_) in enumerate(batches):
# Compute initial attention and confidence
confidence, attention = self.predict(X_, y_)
confidence = confidence.squeeze(1)
attention = attention .squeeze(1)
# Count confidence >= 0.2 of non-optimized datapoints
if return_optimization is not None:
confidence_orig.append((
confidence[torch.arange(y_.shape[0]), y_].exp() >= return_optimization
).detach().clone())
# Make attention variable
attn = Variable(attention.detach().clone(), requires_grad=True)
# Set optimizer
optimizer = optim.Adam([attn], lr=0.1)
criterion = nn.NLLLoss()
# Encode values of X
with torch.no_grad():
X_, _ = self.encoder(X_)
# Perform iterations
for iteration in range(int(iterations)):
# Clear optimizer
optimizer.zero_grad()
# Add decoding function
def decode(input, attn, softmax=False):
if softmax: attn = F.softmax(attn, dim=1)
return self.decoder_event(input, attn)
# Perform prediction
pred = decode(X_, attn, softmax=iteration > 0)
# Compute loss
loss = criterion(pred, y_)
# Perform backpropagation
loss.backward()
optimizer.step()
# Update progress if necessary
if verbose: progress.update()
# Perform final softmax
if iterations > 0: attn = F.softmax(attn, dim=1)
# Detach attention - memory optimization
attn = attn.detach()
# Get confidence levels
confidence_ = self.decoder_event(X_, attn)
confidence_ = confidence_[torch.arange(y_.shape[0]), y_].exp().detach()
confidence = confidence [torch.arange(y_.shape[0]), y_].exp().detach()
# Check where confidence improved
mask = confidence_ > confidence
# Store attention if we improved
attention[mask] = attn[mask]
# Recompute confidence
with torch.no_grad():
confidence = self.decoder_event(
X = X_,
attention = attention,
).exp()
# Count confidence >= 0.2 of optimized datapoints
if return_optimization is not None:
confidence_optim.append((
confidence[torch.arange(y_.shape[0]), y_] >= return_optimization
).detach().clone())
# Add confidence and attention to result
result_confidence.append(confidence.cpu())
result_attention .append(attention .cpu())
# Combine confidence and attention into tensor
# and cast to original device
confidence = torch.cat(result_confidence).to(original_device)
attention = torch.cat(result_attention) .to(original_device)
# Close progress if necessary
if verbose: progress.close()
# Return confidence optimization if necessary
if return_optimization is not None:
confidence_orig = torch.cat(confidence_orig )
confidence_optim = torch.cat(confidence_optim)
# Return result
return confidence, attention, inverse, confidence_orig, confidence_optim
# Return result
return confidence, attention, inverse
########################################################################
# Save/load model #
########################################################################
[docs] def save(self, outfile):
"""Save model to output file.
Parameters
----------
outfile : string
File to output model.
"""
# Save to output file
torch.save(self.state_dict(), outfile)
[docs] @classmethod
def load(cls, infile, device=None):
"""Load model from input file.
Parameters
----------
infile : string
File from which to load model.
"""
# Load state dictionary
state_dict = torch.load(infile, map_location=device)
# Get input variables from state_dict
input_size = state_dict.get('embedding.weight').shape[0]
output_size = state_dict.get('decoder_event.out.weight').shape[0]
hidden_size = state_dict.get('embedding.weight').shape[1]
num_layers = 1 # TODO
max_length = state_dict.get('decoder_attention.attn.weight').shape[0]
bidirectional = state_dict.get('decoder_attention.attn.weight').shape[1] // hidden_size != num_layers
LSTM = False # TODO
# Create ContextBuilder
result = cls(
input_size = input_size,
output_size = output_size,
hidden_size = hidden_size,
num_layers = num_layers,
max_length = max_length,
bidirectional = bidirectional,
LSTM = LSTM,
)
# Cast to device if necessary
if device is not None: result = result.to(device)
# Set trained parameters
result.load_state_dict(state_dict)
# Return result
return result