# Source code for context_builder.context_builder

# Imports
import logging
import math
import random
from tqdm import tqdm

# Torch imports
import torch
import torch.nn            as nn
import torch.nn.functional as F
import torch.optim         as optim
from torch.autograd   import Variable
from torch.utils.data import DataLoader, TensorDataset

# Custom package imports
from .decoders  import DecoderAttention, DecoderEvent
from .embedding import EmbeddingOneHot
from .encoders  import Encoder
from .loss      import LabelSmoothing
from .utils     import unique_2d

# Set logger
logger = logging.getLogger(__name__)

class ContextBuilder(nn.Module):

    def __init__(self, input_size, output_size, hidden_size=128, num_layers=1,
                 max_length=10, bidirectional=False, LSTM=False):
        """ContextBuilder that learns to interpret context from security
        events. Based on an attention-based Encoder-Decoder architecture.

        Parameters
        ----------
        input_size : int
            Size of input vocabulary, i.e. the number of possible distinct
            input items.

        output_size : int
            Size of output vocabulary, i.e. the number of possible distinct
            output items.

        hidden_size : int, default=128
            Size of hidden layer in sequence-to-sequence prediction. This
            parameter determines the complexity of the model and its
            prediction power. However, high values will result in slower
            training and prediction times.

        num_layers : int, default=1
            Number of recurrent layers to use.

        max_length : int, default=10
            Maximum length of input sequence to expect.

        bidirectional : boolean, default=False
            If True, use a bidirectional encoder and decoder.

        LSTM : boolean, default=False
            If True, use an LSTM as recurrent unit instead of a GRU.
        """
        logger.info("ContextBuilder.__init__")

        # Initialise super
        super().__init__()

        ################################################################
        #                       Initialise layers                      #
        ################################################################

        # Create embedding
        self.embedding         = nn.Embedding(input_size, hidden_size)
        self.embedding_one_hot = EmbeddingOneHot(input_size)

        # Create encoder
        self.encoder = Encoder(
            embedding     = self.embedding_one_hot,
            hidden_size   = hidden_size,
            num_layers    = num_layers,
            bidirectional = bidirectional,
            LSTM          = LSTM,
        )

        # Create attention decoder
        self.decoder_attention = DecoderAttention(
            embedding      = self.embedding,
            context_size   = hidden_size,
            attention_size = max_length,
            num_layers     = num_layers,
            dropout        = 0.1,
            bidirectional  = bidirectional,
            LSTM           = LSTM,
        )

        # Create event decoder
        self.decoder_event = DecoderEvent(
            input_size  = input_size,
            output_size = output_size,
            dropout     = 0.1,
        )
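    # Illustrative usage sketch (not part of the original source): constructing
    # a ContextBuilder for a vocabulary of 30 distinct events. All sizes below
    # are assumed example values.
    #
    #   builder = ContextBuilder(
    #       input_size  = 30,   # 30 distinct input events
    #       output_size = 30,   # 30 distinct output events
    #       hidden_size = 128,  # default hidden dimension
    #       max_length  = 10,   # context windows of 10 events
    #   )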
    ########################################################################
    #                        ContextBuilder Forward                        #
    ########################################################################
    def forward(self, X, y=None, steps=1, teach_ratio=0.5):
        """Forwards data through ContextBuilder.

        Parameters
        ----------
        X : torch.Tensor of shape=(n_samples, seq_len)
            Tensor of input events to forward.

        y : torch.Tensor of shape=(n_samples, steps), optional
            If given, use the value of y as next input with probability
            teach_ratio.

        steps : int, default=1
            Number of steps to predict into the future.

        teach_ratio : float, default=0.5
            Ratio of sequences to train using the given labels y. The
            remaining part will be trained using the predicted values.

        Returns
        -------
        confidence : torch.Tensor of shape=(n_samples, steps, output_size)
            The confidence level of each output event.

        attention : torch.Tensor of shape=(n_samples, steps, seq_len)
            Attention corresponding to X given as (batch, out_seq, in_seq).
        """
        logger.info("forward {} samples".format(X.shape[0]))

        ####################################################################
        #                   Perform check on events in X                   #
        ####################################################################
        if X.max() >= self.embedding_one_hot.input_size:
            raise ValueError(
                "Expected {} different input events, but received input event "
                "'{}' not in expected range 0-{}. Please ensure that the "
                "ContextBuilder is configured with the correct input_size and "
                "output_size".format(
                    self.embedding_one_hot.input_size,
                    X.max(),
                    self.embedding_one_hot.input_size - 1,
                ))

        ####################################################################
        #                            Forward data                          #
        ####################################################################

        # Initialise results
        confidence = list()
        attention  = list()

        # Get initial inputs of decoder
        decoder_input = torch.zeros(
            size   = (X.shape[0], 1),
            dtype  = torch.long,
            device = X.device,
        )

        # Encode input
        X_encoded, context_vector = self.encoder(X)

        # Loop over all targets
        for step in range(steps):
            # Compute attention
            attention_, context_vector = self.decoder_attention(
                context_vector = context_vector,
                previous_input = decoder_input,
            )

            # Compute event probability distribution
            confidence_ = self.decoder_event(
                X         = X_encoded,
                attention = attention_,
            )

            # Store confidence
            confidence.append(confidence_)
            # Store attention
            attention .append(attention_)

            # Teacher forcing: with probability teach_ratio feed the true
            # label as next input, otherwise feed the (detached) prediction
            if y is not None and random.random() <= teach_ratio:
                decoder_input = y[:, step]
            else:
                # Detach from history
                decoder_input = confidence_.argmax(dim=1).detach().unsqueeze(1)

        # Return result
        return torch.stack(confidence, dim=1), torch.stack(attention, dim=1)
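    # Illustrative usage sketch (not part of the original source): a single
    # forward pass over a batch of integer-encoded context windows. The data
    # below is randomly generated; shapes follow the docstring above.
    #
    #   X = torch.randint(0, 30, (64, 10))   # 64 contexts of 10 events
    #   confidence, attention = builder.forward(X, steps=1)
    #   # confidence.shape == (64, 1, 30), attention.shape == (64, 1, 10)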
    ########################################################################
    #                          Fit/predict methods                         #
    ########################################################################
    def fit(self, X, y, epochs=10, batch_size=128, learning_rate=0.01,
            optimizer=optim.SGD, teach_ratio=0.5, verbose=True):
        """Fit the sequence predictor with labelled data.

        Parameters
        ----------
        X : array-like of type=int and shape=(n_samples, context_size)
            Input context to train with.

        y : array-like of type=int and shape=(n_samples, n_future_events)
            Sequences of target events.

        epochs : int, default=10
            Number of epochs to train with.

        batch_size : int, default=128
            Batch size to use for training.

        learning_rate : float, default=0.01
            Learning rate to use for training.

        optimizer : optim.Optimizer, default=torch.optim.SGD
            Optimizer to use for training.

        teach_ratio : float, default=0.5
            Ratio of sequences to train including labels.

        verbose : boolean, default=True
            If True, prints progress.

        Returns
        -------
        self : self
            Returns self.
        """
        logger.info("fit {} samples".format(X.shape[0]))

        # Get current mode
        mode = self.training

        # Get input as torch tensors on the device of the model
        device = next(self.parameters()).device
        X = torch.as_tensor(X, dtype=torch.int64, device=device)
        y = torch.as_tensor(y, dtype=torch.int64, device=device)

        # Set to training mode
        self.train()

        # Set criterion and optimiser
        criterion = LabelSmoothing(self.decoder_event.out.out_features, 0.1)
        optimizer = optimizer(
            params = self.parameters(),
            lr     = learning_rate,
        )

        # Load dataset
        data = DataLoader(TensorDataset(X, y),
            batch_size = batch_size,
            shuffle    = True,
        )

        # Loop over each epoch
        for epoch in range(1, epochs+1):
            try:
                # Set progress bar if necessary; wrap a fresh tqdm around the
                # DataLoader each epoch instead of re-wrapping the previous bar
                if verbose:
                    batches = tqdm(data,
                        desc="[Epoch {:{width}}/{:{width}} loss={:.4f}]"
                        .format(epoch, epochs, 0, width=len(str(epochs)))
                    )
                else:
                    batches = data

                # Set average loss
                total_loss  = 0
                total_items = 0

                # Loop over entire dataset
                for X_, y_ in batches:
                    # Clear gradients
                    optimizer.zero_grad()

                    # Get prediction
                    confidence, _ = self.forward(X_, y_,
                        steps       = y_.shape[1],
                        teach_ratio = teach_ratio,
                    )

                    # Compute loss over all predicted steps
                    loss = 0
                    for step in range(confidence.shape[1]):
                        loss += criterion(confidence[:, step], y_[:, step])

                    # Backpropagate
                    loss.backward()
                    optimizer.step()

                    # Update description
                    total_loss  += loss.item() / X_.shape[1]
                    total_items += X_.shape[0]
                    if verbose:
                        batches.set_description(
                            "[Epoch {:{width}}/{:{width}} loss={:.4f}]"
                            .format(epoch, epochs, total_loss/total_items,
                                    width=len(str(epochs))))

            except KeyboardInterrupt:
                print("\nTraining interrupted, performing clean stop")
                break

        # Reset to original mode
        self.train(mode)

        # Return self
        return self
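    # Illustrative usage sketch (not part of the original source): fitting on
    # randomly generated training data; in practice X and y come from
    # integer-encoded security event sequences.
    #
    #   X_train = torch.randint(0, 30, (1000, 10))   # contexts
    #   y_train = torch.randint(0, 30, (1000, 1))    # next events
    #   builder.fit(X_train, y_train, epochs=10, batch_size=128)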
    def predict(self, X, y=None, steps=1):
        """Predict the next elements in the sequence.

        Parameters
        ----------
        X : torch.Tensor
            Tensor of input sequences.

        y : ignored

        steps : int, default=1
            Number of steps to predict into the future.

        Returns
        -------
        confidence : torch.Tensor of shape=(n_samples, steps, output_size)
            The confidence level of each output event.

        attention : torch.Tensor of shape=(n_samples, steps, seq_len)
            Attention corresponding to X given as (batch, out_seq, in_seq).
        """
        logger.info("predict {} samples".format(X.shape[0]))

        # Get current mode
        mode = self.training

        # Set to prediction mode
        self.eval()

        # Memory optimization, only use unique values
        X, inverse = torch.unique(X, dim=0, return_inverse=True)
        logger.info("predict {}/{} unique samples".format(
            X.shape[0], inverse.shape[0]))

        # Do not perform gradient descent
        with torch.no_grad():
            # Perform all in single batch
            confidence, attention = self.forward(X, steps=steps)

        # Reset to original mode
        self.train(mode)

        # Return result
        return confidence[inverse], attention[inverse]
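    # Illustrative usage sketch (not part of the original source): predicting
    # the next event for each context; thanks to the torch.unique memory
    # optimization above, duplicate contexts are only forwarded once.
    #
    #   confidence, attention = builder.predict(X_test, steps=1)
    #   next_events = confidence[:, 0].argmax(dim=1)  # most likely next event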
    def fit_predict(self, X, y, epochs=10, batch_size=128, learning_rate=0.01,
                    optimizer=optim.SGD, teach_ratio=0.5, verbose=True):
        """Fit the sequence predictor with labelled data, then predict on X.

        Parameters
        ----------
        X : torch.Tensor
            Tensor of input sequences.

        y : torch.Tensor
            Tensor of output sequences.

        epochs : int, default=10
            Number of epochs to train with.

        batch_size : int, default=128
            Batch size to use for training.

        learning_rate : float, default=0.01
            Learning rate to use for training.

        optimizer : optim.Optimizer, default=torch.optim.SGD
            Optimizer to use for training.

        teach_ratio : float, default=0.5
            Ratio of sequences to train including labels.

        verbose : boolean, default=True
            If True, prints progress.

        Returns
        -------
        confidence : torch.Tensor of shape=(n_samples, steps, output_size)
            The confidence level of each output event, as returned by predict.

        attention : torch.Tensor of shape=(n_samples, steps, seq_len)
            Attention corresponding to X, as returned by predict.
        """
        logger.info("fit_predict {} samples".format(X.shape[0]))

        # Apply fit and predict in sequence
        return self.fit(
            X             = X,
            y             = y,
            epochs        = epochs,
            batch_size    = batch_size,
            learning_rate = learning_rate,
            optimizer     = optimizer,
            teach_ratio   = teach_ratio,
            verbose       = verbose,
        ).predict(X)
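    # Illustrative usage sketch (not part of the original source): fit and
    # predict on the same data in one call.
    #
    #   confidence, attention = builder.fit_predict(X_train, y_train, epochs=10)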
    ########################################################################
    #                         ContextBuilder Query                         #
    ########################################################################
    def query(self, X, y, iterations=0, batch_size=1024, ignore=None,
              return_optimization=None, verbose=True):
        """Query the network to get the optimal attention vector.

        Parameters
        ----------
        X : array-like of type=int and shape=(n_samples, context_size)
            Input context of events, same as input to fit and predict.

        y : array-like of type=int and shape=(n_samples,)
            Observed event.

        iterations : int, default=0
            Number of optimization iterations to perform for the actual event.

        batch_size : int, default=1024
            Batch size of items to optimize.

        ignore : int, optional
            If given, ignore this index as attention.

        return_optimization : float, optional
            If given, also return which items reached a confidence level of at
            least this threshold. E.g. return_optimization=0.2 additionally
            returns two boolean tensors marking elements with a confidence
            >= 0.2 before and after optimization.

        verbose : boolean, default=True
            If True, print progress.

        Returns
        -------
        confidence : torch.Tensor of shape=(n_samples, output_size)
            Confidence of each prediction given new attention.

        attention : torch.Tensor of shape=(n_samples, context_size)
            Importance of each input with respect to output.

        inverse : torch.Tensor of shape=(n_samples,)
            Inverse is returned to reconstruct the original array.

        confidence_orig : torch.Tensor of shape=(n_samples,)
            Only returned if return_optimization is not None.
            Boolean array of items >= threshold before optimization.

        confidence_optim : torch.Tensor of shape=(n_samples,)
            Only returned if return_optimization is not None.
            Boolean array of items >= threshold after optimization.
        """
        # Get device
        original_device = X.device

        # Initialise result
        result_confidence = list()
        result_attention  = list()

        # Memory optimization, only use unique values
        X, y, inverse = unique_2d(X, y)

        # Ignore given datapoints
        if ignore is not None:
            raise NotImplementedError("Ignore is not properly implemented yet.")
            # Unreachable until implemented:
            # attention[X == ignore] = 0

        # Squeeze variables
        y = y.squeeze(1)

        # Initialise progress if necessary
        if verbose:
            progress = tqdm(None,
                total = int(iterations)*int(math.ceil(X.shape[0]/batch_size)),
                desc  = "Optimizing query",
            )

        # Batch data
        batches = DataLoader(
            TensorDataset(X, y),
            batch_size = batch_size,
            shuffle    = False,
        )

        # Count datapoints with confidence >= return_optimization threshold
        if return_optimization is not None:
            confidence_orig  = list()
            confidence_optim = list()

        ################################################################
        #                     Attention optimisation                   #
        ################################################################

        # Loop over batches
        for batch, (X_, y_) in enumerate(batches):
            # Compute initial attention and confidence
            confidence, attention = self.predict(X_, y_)
            confidence = confidence.squeeze(1)
            attention  = attention .squeeze(1)

            # Count confidence >= threshold of non-optimized datapoints
            if return_optimization is not None:
                confidence_orig.append((
                    confidence[torch.arange(y_.shape[0]), y_].exp()
                    >= return_optimization
                ).detach().clone())

            # Make attention variable
            attn = Variable(attention.detach().clone(), requires_grad=True)

            # Set optimizer
            optimizer = optim.Adam([attn], lr=0.1)
            criterion = nn.NLLLoss()

            # Encode values of X
            with torch.no_grad():
                X_, _ = self.encoder(X_)

            # Decoding helper
            def decode(input, attn, softmax=False):
                if softmax:
                    attn = F.softmax(attn, dim=1)
                return self.decoder_event(input, attn)

            # Perform iterations
            for iteration in range(int(iterations)):
                # Clear optimizer
                optimizer.zero_grad()

                # Perform prediction
                pred = decode(X_, attn, softmax=iteration > 0)

                # Compute loss
                loss = criterion(pred, y_)

                # Perform backpropagation
                loss.backward()
                optimizer.step()

                # Update progress if necessary
                if verbose:
                    progress.update()

            # Perform final softmax
            if iterations > 0:
                attn = F.softmax(attn, dim=1)

            # Detach attention - memory optimization
            attn = attn.detach()

            # Get confidence levels
            confidence_ = self.decoder_event(X_, attn)
            confidence_ = confidence_[torch.arange(y_.shape[0]), y_].exp().detach()
            confidence  = confidence [torch.arange(y_.shape[0]), y_].exp().detach()

            # Check where confidence improved
            mask = confidence_ > confidence

            # Store attention if we improved
            attention[mask] = attn[mask]

            # Recompute confidence
            with torch.no_grad():
                confidence = self.decoder_event(
                    X         = X_,
                    attention = attention,
                ).exp()

            # Count confidence >= threshold of optimized datapoints
            if return_optimization is not None:
                confidence_optim.append((
                    confidence[torch.arange(y_.shape[0]), y_]
                    >= return_optimization
                ).detach().clone())

            # Add confidence and attention to result
            result_confidence.append(confidence.cpu())
            result_attention .append(attention .cpu())

        # Combine confidence and attention into tensor
        # and cast to original device
        confidence = torch.cat(result_confidence).to(original_device)
        attention  = torch.cat(result_attention ).to(original_device)

        # Close progress if necessary
        if verbose:
            progress.close()

        # Return confidence optimization if necessary
        if return_optimization is not None:
            confidence_orig  = torch.cat(confidence_orig )
            confidence_optim = torch.cat(confidence_optim)

            # Return result
            return confidence, attention, inverse, confidence_orig, confidence_optim

        # Return result
        return confidence, attention, inverse
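    # Illustrative usage sketch (not part of the original source): querying the
    # network for optimized attention over training contexts; the iteration
    # count is an assumed example value.
    #
    #   confidence, attention, inverse = builder.query(
    #       X          = X_train,
    #       y          = y_train,
    #       iterations = 100,
    #   )
    #   # Map results back to the original sample order
    #   confidence = confidence[inverse]
    #   attention  = attention [inverse]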
    ########################################################################
    #                            Save/load model                           #
    ########################################################################
    def save(self, outfile):
        """Save model to output file.

        Parameters
        ----------
        outfile : string
            File to output model.
        """
        # Save to output file
        torch.save(self.state_dict(), outfile)
    @classmethod
    def load(cls, infile, device=None):
        """Load model from input file.

        Parameters
        ----------
        infile : string
            File from which to load model.

        device : string, optional
            Device on which to load the model, e.g. 'cpu' or 'cuda'.

        Returns
        -------
        result : ContextBuilder
            ContextBuilder with parameters loaded from infile.
        """
        # Load state dictionary
        state_dict = torch.load(infile, map_location=device)

        # Get input variables from state_dict
        input_size    = state_dict.get('embedding.weight').shape[0]
        output_size   = state_dict.get('decoder_event.out.weight').shape[0]
        hidden_size   = state_dict.get('embedding.weight').shape[1]
        num_layers    = 1 # TODO
        max_length    = state_dict.get('decoder_attention.attn.weight').shape[0]
        bidirectional = (state_dict.get('decoder_attention.attn.weight')
                         .shape[1] // hidden_size != num_layers)
        LSTM          = False # TODO

        # Create ContextBuilder
        result = cls(
            input_size    = input_size,
            output_size   = output_size,
            hidden_size   = hidden_size,
            num_layers    = num_layers,
            max_length    = max_length,
            bidirectional = bidirectional,
            LSTM          = LSTM,
        )

        # Cast to device if necessary
        if device is not None:
            result = result.to(device)

        # Set trained parameters
        result.load_state_dict(state_dict)

        # Return result
        return result
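
################################################################################
#                         Example usage (illustrative)                         #
################################################################################
# A minimal end-to-end sketch, not part of the original module. It trains on
# randomly generated integer-encoded events purely as a smoke test; all sizes
# and the output filename are assumed example values.

if __name__ == "__main__":
    # Generate 100 random context windows of 10 events each, drawn from a
    # vocabulary of 30 distinct events, plus a "next event" target per window
    X = torch.randint(0, 30, (100, 10))
    y = torch.randint(0, 30, (100, 1))

    # Train briefly and predict the next event for every context
    builder = ContextBuilder(input_size=30, output_size=30, max_length=10)
    confidence, attention = builder.fit_predict(X, y, epochs=2)

    # Persist the trained model and restore it from disk
    builder.save("context_builder.pth")
    restored = ContextBuilder.load("context_builder.pth")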