Source code for fresco.models.mtcnn

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F


class MTCNN(nn.Module):
    '''
    Multitask simple text CNN for classifying cancer pathology reports.

    Args:
        embedding_matrix (numpy.array): Numpy array of word embeddings.
            Each row should represent a word embedding.
            NOTE: The word index 0 is masked, so the first row is ignored
            (it is overwritten with zeros after normalization).
            The array passed in is copied and never modified.
        num_classes (list[int]): Number of possible output classes for each task.
        window_sizes (list[int], default: [3, 4, 5]): Window size (consecutive
            tokens examined) in parallel convolution layers.
            Must match the length of num_filters.
        num_filters (list[int], default: [300, 300, 300]): Number of filters used
            in parallel convolution layers.
            Must match the length of window_sizes.
        dropout (float, default: 0.5): Dropout rate applied to the final
            document embedding after maxpooling.
        bag_of_embeddings (bool, default: True): Adds a parallel bag of embeddings
            layer and concatenates it to the final document embedding.
        embeddings_scale (float, default: 20): Scaling of word embeddings matrix
            columns. Each column is centered on its mean and divided by
            (its sample standard deviation * embeddings_scale).
    '''

    # Historical lower bound on the number of tokens a batch is trimmed to.
    # forward() raises it to the largest convolution window when needed.
    _MIN_DOC_LEN = 5

    def __init__(self,
                 embedding_matrix,
                 num_classes,
                 window_sizes=None,
                 num_filters=None,
                 dropout=0.5,
                 bag_of_embeddings=True,
                 embeddings_scale=20
                 ):
        if window_sizes is None:
            window_sizes = [3, 4, 5]
        if num_filters is None:
            num_filters = [300, 300, 300]

        super().__init__()

        # Work on a private copy so the caller's array is not normalized in
        # place; promote non-float input so the in-place arithmetic is legal.
        embedding_matrix = np.array(embedding_matrix)
        if not np.issubdtype(embedding_matrix.dtype, np.floating):
            embedding_matrix = embedding_matrix.astype(np.float64)

        # normalize and initialize embeddings
        embedding_matrix -= embedding_matrix.mean(axis=0)
        embedding_matrix /= (embedding_matrix.std(axis=0, ddof=1) * embeddings_scale)
        embedding_matrix[0] = 0  # padding row must stay at exactly zero
        embed_dim = embedding_matrix.shape[1]
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float),
            freeze=False, padding_idx=0)

        # parallel convolution layers
        self.conv_layers = nn.ModuleList()
        for window, filters in zip(window_sizes, num_filters):
            conv = nn.Conv1d(embed_dim, filters, window)
            torch.nn.init.xavier_uniform_(conv.weight)
            conv.bias.data.fill_(0.01)
            self.conv_layers.append(conv)

        self.drop_layer = nn.Dropout(dropout)

        # optional bag of embeddings layers
        self.boe = bag_of_embeddings
        if self.boe:
            self.boe_dense = nn.Linear(embed_dim, embed_dim)
            torch.nn.init.xavier_uniform_(self.boe_dense.weight)
            self.boe_dense.bias.data.fill_(0.0)

        # dense classification layers
        # The input width is derived from the conv layers that were actually
        # built, so it always equals the width of the concatenated features.
        in_size = int(sum(conv.out_channels for conv in self.conv_layers))
        if self.boe:
            in_size += embed_dim
        self.classify_layers = nn.ModuleList()
        for n_out in num_classes:
            dense = nn.Linear(in_size, n_out)
            torch.nn.init.xavier_uniform_(dense.weight)
            dense.bias.data.fill_(0.0)
            self.classify_layers.append(dense)

    def forward(self, docs: torch.Tensor, return_embeds: bool = False):
        '''
        MT-CNN forward pass.

        Args:
            docs (torch.tensor): Batch of documents to classify.
                Each document should be a 0-padded row of mapped word indices.
            return_embeds (bool, default: False): If True, also return the
                document embeddings that are fed to the classification layers.

        Returns:
            list[torch.tensor]: List of predicted logits for each task.
            If return_embeds is True, a tuple (logits, doc_embeds) is
            returned instead.
        '''
        # Every convolution needs at least `kernel_size` tokens, so the batch
        # is never trimmed below the largest window (and never below 5).
        min_len = max([self._MIN_DOC_LEN]
                      + [conv.kernel_size[0] for conv in self.conv_layers])

        # Right-pad batches that are narrower than the minimum length with the
        # padding index so the widest convolution still has a valid input.
        if docs.shape[1] < min_len:
            filler = docs.new_zeros((docs.shape[0], min_len - docs.shape[1]))
            docs = torch.cat([docs, filler], dim=1)

        # generate masks for word padding
        # remove extra padding that exists across all documents in batch
        mask_words = (docs != 0)
        words_per_line = mask_words.sum(-1)
        max_words = max(int(words_per_line.max()), min_len)
        mask_words = torch.unsqueeze(mask_words[:, :max_words], -1)
        docs_input_reduced = docs[:, :max_words]

        # word embeddings
        word_embeds = self.embedding(docs_input_reduced)
        word_embeds = torch.mul(word_embeds, mask_words.type(word_embeds.dtype))
        # Conv1d convolves over the last axis: move the token axis last
        word_embeds = word_embeds.permute(0, 2, 1)

        # parallel 1D word convolutions, each max-pooled over the token axis
        conv_outs = []
        for conv in self.conv_layers:
            conv_out = F.relu(conv(word_embeds))
            conv_outs.append(torch.max(conv_out, 2)[0])
        concat = torch.cat(conv_outs, 1)

        # bag of embeddings operations if enabled
        if self.boe:
            bag_embeds = torch.sum(word_embeds, -1)
            # Clamp the word count to 1 so an all-padding document yields a
            # zero mean embedding instead of 0 * inf = NaN.
            word_counts = torch.unsqueeze(words_per_line.clamp(min=1), -1)
            bag_embeds = torch.mul(bag_embeds, 1 / word_counts.type(bag_embeds.dtype))
            bag_embeds = torch.tanh(self.boe_dense(bag_embeds))
            concat = torch.cat([concat, bag_embeds], 1)

        # generate logits for each task
        doc_embeds = self.drop_layer(concat)
        logits = [dense(doc_embeds) for dense in self.classify_layers]

        if return_embeds:
            return logits, doc_embeds
        return logits