
Commit 2c79f9a

PyTorch 0.4.1 support (#25)
* Closes #15
* Make tests pass for PyTorch 0.4
* Get rid of _new; use tensor.new_* and torch.*_like instead
* Get rid of obsolete tensor creation methods
* Get rid of warnings
* Use PyTorch's logsumexp
* Fix type annotations and docstrings
* Update README
1 parent 9c3a147 commit 2c79f9a
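
For orientation, here is a minimal standalone sketch (not part of the commit) of the PyTorch 0.4 idioms the bullet points above refer to; all variable names are illustrative only:

    import torch
    import torch.nn as nn

    num_tags = 5

    # torch.empty + in-place init replace torch.Tensor(...) and the deprecated nn.init.uniform
    transitions = nn.Parameter(torch.empty(num_tags, num_tags))
    nn.init.uniform_(transitions, -0.1, 0.1)

    # Plain tensors carry autograd state; no torch.autograd.Variable wrapper is needed
    tags = torch.tensor([[0, 1], [2, 4], [3, 1]], dtype=torch.long)

    # torch.*_like and tensor.new_* replace the removed _new helper
    mask = torch.ones_like(tags, dtype=torch.uint8)         # shape and device follow tags
    ones = transitions.new_ones((3, 2), dtype=torch.uint8)  # device follows transitions

    # 0-dim tensors are unwrapped with .item() rather than indexing with [0]
    scores = torch.randn(2, num_tags)
    best_score = scores.max().item()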

4 files changed: +106 −166 lines changed


.travis.yml

Lines changed: 1 addition & 6 deletions
@@ -1,17 +1,12 @@
 language: python
-env:
-  - PYTORCH_VERSION=0.3.0
-  - PYTORCH_VERSION=0.3.1
 before_install:
   - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
   - bash ~/miniconda.sh -b -p $HOME/miniconda
   - export PATH="$HOME/miniconda/bin:$PATH"
   - conda update -y conda
   - conda create -y -n pytorch-crf python=3.6
   - source activate pytorch-crf
-  - conda install -y -c pytorch pytorch=$PYTORCH_VERSION
-  # explicitly specify cudatoolkit version (see https://discuss.pytorch.org/t/libcudart-so-8-0-not-found-in-travis-ci/16071/3)
-  - if [[ $PYTORCH_VERSION == "0.3.0" ]]; then conda install -y -c pytorch cudatoolkit=8.0; fi
+  - conda install -y -c pytorch pytorch=0.4.1
   - export LD_LIBRARY_PATH="$HOME/miniconda/envs/pytorch-crf/lib:$LD_LIBRARY_PATH"
 install:
   - pip install --ignore-installed -r requirements.txt

README.rst

Lines changed: 6 additions & 10 deletions
@@ -24,7 +24,7 @@ Requirements
 ============

 - Python 3.6
-- PyTorch 0.3.0
+- PyTorch 0.4.1

 Installation
 ============
@@ -47,8 +47,8 @@ In the examples below, we will assume that these lines have been executed
    >>> import torch
    >>> from torchcrf import CRF
    >>> seq_length, batch_size, num_tags = 3, 2, 5
-   >>> emissions = torch.autograd.Variable(torch.randn(seq_length, batch_size, num_tags), requires_grad=True)
-   >>> tags = torch.autograd.Variable(torch.LongTensor([[0, 1], [2, 4], [3, 1]]))  # (seq_length, batch_size)
+   >>> emissions = torch.randn(seq_length, batch_size, num_tags)
+   >>> tags = torch.tensor([[0, 1], [2, 4], [3, 1]], dtype=torch.long)  # (seq_length, batch_size)
    >>> model = CRF(num_tags)

 Computing log likelihood
@@ -57,20 +57,16 @@ Computing log likelihood
 .. code-block:: python

    >>> model(emissions, tags)
-   Variable containing:
-   -10.0635
-   [torch.FloatTensor of size 1]
+   tensor(-12.7431, grad_fn=<SumBackward0>)

 Computing log likelihood with mask
 ----------------------------------

 .. code-block:: python

-   >>> mask = torch.autograd.Variable(torch.ByteTensor([[1, 1], [1, 1], [1, 0]]))  # (seq_length, batch_size)
+   >>> mask = torch.tensor([[1, 1], [1, 1], [1, 0]], dtype=torch.uint8)  # (seq_length, batch_size)
    >>> model(emissions, tags, mask=mask)
-   Variable containing:
-   -8.4981
-   [torch.FloatTensor of size 1]
+   tensor(-10.8390, grad_fn=<SumBackward0>)

 Decoding
 --------
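
The diff context stops at the ``Decoding`` heading. As a rough usage sketch (not part of this diff), based on the ``decode`` signature updated in ``src/torchcrf/__init__.py`` below, decoding looks like this; the exact output shown in the real README may differ:

   >>> model.decode(emissions)             # List[List[int]]: one best tag sequence per batch entry
   >>> model.decode(emissions, mask=mask)  # timesteps where mask == 0 are excluded from the decoded path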

src/torchcrf/__init__.py

Lines changed: 40 additions & 62 deletions
@@ -1,6 +1,5 @@
-from typing import List, Optional, Union
+from typing import List, Optional

-from torch.autograd import Variable
 import torch
 import torch.nn as nn

@@ -20,8 +19,6 @@ class CRF(nn.Module):

     Attributes
     ----------
-    num_tags : int
-        Number of tags passed to ``__init__``.
     start_transitions : :class:`~torch.nn.Parameter`
         Start transition score tensor of size ``(num_tags,)``.
     end_transitions : :class:`~torch.nn.Parameter`
@@ -43,9 +40,9 @@ def __init__(self, num_tags: int) -> None:
             raise ValueError(f'invalid number of tags: {num_tags}')
         super().__init__()
         self.num_tags = num_tags
-        self.start_transitions = nn.Parameter(torch.Tensor(num_tags))
-        self.end_transitions = nn.Parameter(torch.Tensor(num_tags))
-        self.transitions = nn.Parameter(torch.Tensor(num_tags, num_tags))
+        self.start_transitions = nn.Parameter(torch.empty(num_tags))
+        self.end_transitions = nn.Parameter(torch.empty(num_tags))
+        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

         self.reset_parameters()

@@ -55,35 +52,35 @@ def reset_parameters(self) -> None:
         The parameters will be initialized randomly from a uniform distribution
         between -0.1 and 0.1.
         """
-        nn.init.uniform(self.start_transitions, -0.1, 0.1)
-        nn.init.uniform(self.end_transitions, -0.1, 0.1)
-        nn.init.uniform(self.transitions, -0.1, 0.1)
+        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
+        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
+        nn.init.uniform_(self.transitions, -0.1, 0.1)

     def __repr__(self) -> str:
         return f'{self.__class__.__name__}(num_tags={self.num_tags})'

     def forward(self,
-                emissions: Variable,
-                tags: Variable,
-                mask: Optional[Variable] = None,
+                emissions: torch.Tensor,
+                tags: torch.LongTensor,
+                mask: Optional[torch.ByteTensor] = None,
                 reduce: bool = True,
-                ) -> Variable:
+                ) -> torch.Tensor:
         """Compute the log likelihood of the given sequence of tags and emission score.

         Arguments
         ---------
-        emissions : :class:`~torch.autograd.Variable`
+        emissions : :class:`~torch.Tensor`
             Emission score tensor of size ``(seq_length, batch_size, num_tags)``.
-        tags : :class:`~torch.autograd.Variable`
-            Sequence of tags as ``LongTensor`` of size ``(seq_length, batch_size)``.
-        mask : :class:`~torch.autograd.Variable`, optional
-            Mask tensor as ``ByteTensor`` of size ``(seq_length, batch_size)``.
+        tags : :class:`~torch.LongTensor`
+            Sequence of tags of size ``(seq_length, batch_size)``.
+        mask : :class:`~torch.ByteTensor`, optional
+            Mask tensor of size ``(seq_length, batch_size)``.
         reduce : bool
             Whether to sum the log likelihood over the batch.

         Returns
         -------
-        :class:`~torch.autograd.Variable`
+        :class:`~torch.Tensor`
             The log likelihood. This will have size (1,) if ``reduce=True``, ``(batch_size,)``
             otherwise.
         """
@@ -107,32 +104,32 @@ def forward(self,
                     f'size of tags and mask must match, got {tuple(tags.size())} '
                     f'and {tuple(mask.size())}'
                 )
-            if not all(mask[0].data):
+            if not all(mask[0]):
                 raise ValueError('mask of the first timestep must all be on')

         if mask is None:
-            mask = Variable(self._new(tags.size()).fill_(1)).byte()
+            mask = torch.ones_like(tags, dtype=torch.uint8)

         numerator = self._compute_joint_llh(emissions, tags, mask)
         denominator = self._compute_log_partition_function(emissions, mask)
         llh = numerator - denominator
         return llh if not reduce else torch.sum(llh)

     def decode(self,
-               emissions: Union[Variable, torch.FloatTensor],
-               mask: Optional[Union[Variable, torch.ByteTensor]] = None) -> List[List[int]]:
+               emissions: torch.Tensor,
+               mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
         """Find the most likely tag sequence using Viterbi algorithm.

         Arguments
         ---------
-        emissions : :class:`~torch.autograd.Variable` or :class:`~torch.FloatTensor`
+        emissions : :class:`~torch.Tensor`
             Emission score tensor of size ``(seq_length, batch_size, num_tags)``.
-        mask : :class:`~torch.autograd.Variable` or :class:`torch.ByteTensor`
+        mask : :class:`~torch.ByteTensor`
             Mask tensor of size ``(seq_length, batch_size)``.

         Returns
         -------
-        list
+        List[List[int]]
             List of list containing the best tag sequence for each batch.
         """
         if emissions.dim() != 3:
@@ -148,27 +145,23 @@ def decode(self,
                 f'got {tuple(emissions.size()[:2])} and {tuple(mask.size())}'
             )

-        if isinstance(emissions, Variable):
-            emissions = emissions.data
         if mask is None:
-            mask = self._new(emissions.size()[:2]).fill_(1).byte()
-        elif isinstance(mask, Variable):
-            mask = mask.data
+            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

         return self._viterbi_decode(emissions, mask)

     def _compute_joint_llh(self,
-                           emissions: Variable,
-                           tags: Variable,
-                           mask: Variable) -> Variable:
+                           emissions: torch.Tensor,
+                           tags: torch.LongTensor,
+                           mask: torch.ByteTensor) -> torch.Tensor:
         # emissions: (seq_length, batch_size, num_tags)
         # tags: (seq_length, batch_size)
         # mask: (seq_length, batch_size)
         assert emissions.dim() == 3 and tags.dim() == 2
         assert emissions.size()[:2] == tags.size()
         assert emissions.size(2) == self.num_tags
         assert mask.size() == tags.size()
-        assert all(mask[0].data)
+        assert all(mask[0])

         seq_length = emissions.size(0)
         mask = mask.float()
@@ -197,14 +190,14 @@ def _compute_joint_llh(self,
         return llh

     def _compute_log_partition_function(self,
-                                        emissions: Variable,
-                                        mask: Variable) -> Variable:
+                                        emissions: torch.Tensor,
+                                        mask: torch.ByteTensor) -> torch.Tensor:
         # emissions: (seq_length, batch_size, num_tags)
         # mask: (seq_length, batch_size)
         assert emissions.dim() == 3 and mask.dim() == 2
         assert emissions.size()[:2] == mask.size()
         assert emissions.size(2) == self.num_tags
-        assert all(mask[0].data)
+        assert all(mask[0])

         seq_length = emissions.size(0)
         mask = mask.float()
@@ -226,15 +219,15 @@ def _compute_log_partition_function(self,
                 + broadcast_emissions  # (batch_size, num_tags, num_tags)
             # Sum over all possible current tags, but we're in log prob space, so a sum
             # becomes a log-sum-exp
-            score = self._log_sum_exp(score, 1)  # (batch_size, num_tags)
+            score = torch.logsumexp(score, 1)  # (batch_size, num_tags)
             # Set log_prob to the score if this timestep is valid (mask == 1), otherwise
             # leave it alone
             log_prob = score * mask[i].unsqueeze(1) + log_prob * (1.-mask[i]).unsqueeze(1)

         # End transition score
         log_prob += self.end_transitions.view(1, -1)
         # Sum (log-sum-exp) over all possible tags
-        return self._log_sum_exp(log_prob, 1)  # (batch_size,)
+        return torch.logsumexp(log_prob, 1)  # (batch_size,)

     def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor) \
             -> List[List[int]]:
@@ -251,7 +244,7 @@ def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor)

         # Start transition
         viterbi_score = []
-        viterbi_score.append(self.start_transitions.data + emissions[0])
+        viterbi_score.append(self.start_transitions + emissions[0])
         viterbi_path = []

         # Here, viterbi_score is a list of tensors of shapes of (num_tags,) where value at
@@ -269,7 +262,7 @@ def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor)
             # Compute the score matrix of shape (batch_size, num_tags, num_tags) where
             # for each sample, each entry at row i and column j stores the score of
             # transitioning from tag i to tag j and emitting
-            score = broadcast_score + self.transitions.data + broadcast_emission
+            score = broadcast_score + self.transitions + broadcast_emission
             # Find the maximum score over all possible current tag
             best_score, best_path = score.max(1)  # (batch_size,num_tags,)
             # Save the score and the path
@@ -280,32 +273,17 @@ def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor)
         for idx in range(batch_size):
             # Find the tag which maximizes the score at the last timestep; this is our best tag
             # for the last timestep
-            seq_end = sequence_lengths[idx]-1
-            _, best_last_tag = (viterbi_score[seq_end][idx] + self.end_transitions.data).max(0)
-            best_tags = [best_last_tag[0]]
+            seq_end = sequence_lengths[idx] - 1
+            _, best_last_tag = (viterbi_score[seq_end][idx] + self.end_transitions).max(0)
+            best_tags = [best_last_tag.item()]

             # We trace back where the best last tag comes from, append that to our best tag
             # sequence, and trace it back again, and so on
             for path in reversed(viterbi_path[:sequence_lengths[idx] - 1]):
                 best_last_tag = path[idx][best_tags[-1]]
-                best_tags.append(best_last_tag)
+                best_tags.append(best_last_tag.item())

             # Reverse the order because we start from the last timestep
             best_tags.reverse()
             best_tags_list.append(best_tags)
         return best_tags_list
-
-    @staticmethod
-    def _log_sum_exp(tensor: Variable, dim: int) -> Variable:
-        # Find the max value along `dim`
-        offset, _ = tensor.max(dim)
-        # Make offset broadcastable
-        broadcast_offset = offset.unsqueeze(dim)
-        # Perform log-sum-exp safely
-        safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))
-        # Add offset back
-        return offset + safe_log_sum_exp
-
-    def _new(self, *args, **kwargs) -> Union[torch.FloatTensor, torch.cuda.FloatTensor]:
-        param = next(self.parameters())
-        return param.data.new(*args, **kwargs)
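
Since the hand-rolled ``_log_sum_exp`` helper is deleted above in favor of ``torch.logsumexp`` (available in PyTorch 0.4.1), here is a small standalone check (not part of the commit) that the two compute the same numerically stable quantity:

    import torch

    def manual_log_sum_exp(tensor: torch.Tensor, dim: int) -> torch.Tensor:
        # Same max-offset trick the removed _log_sum_exp staticmethod used
        offset, _ = tensor.max(dim)
        broadcast_offset = offset.unsqueeze(dim)
        return offset + torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))

    score = torch.randn(2, 5, 5)  # e.g. (batch_size, num_tags, num_tags)
    assert torch.allclose(torch.logsumexp(score, 1), manual_log_sum_exp(score, 1))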
