/usr/share/octave/packages/statistics-1.3.0/hmmestimate.m

## Copyright (C) 2006, 2007 Arno Onken <asnelt@asnelt.org>
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn {Function File} {[@var{transprobest}, @var{outprobest}] =} hmmestimate (@var{sequence}, @var{states})
## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'statenames', @var{statenames})
## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'symbols', @var{symbols})
## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudotransitions', @var{pseudotransitions})
## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudoemissions', @var{pseudoemissions})
## Estimate the matrix of transition probabilities and the matrix of output
## probabilities of a given sequence of outputs and states generated by a
## hidden Markov model. The model assumes that the generation starts in
## state @code{1} at step @code{0} but does not include step @code{0} in the
## generated states and sequence.
##
## @subheading Arguments
##
## @itemize @bullet
## @item
## @var{sequence} is a vector of a sequence of given outputs. The outputs
## must be integers ranging from @code{1} to the number of outputs of the
## hidden Markov model.
##
## @item
## @var{states} is a vector of the same length as @var{sequence} of given
## states. The states must be integers ranging from @code{1} to the number
## of states of the hidden Markov model.
## @end itemize
##
## @subheading Return values
##
## @itemize @bullet
## @item
## @var{transprobest} is the matrix of the estimated transition
## probabilities of the states. @code{transprobest(i, j)} is the estimated
## probability of a transition to state @code{j} given state @code{i}.
##
## @item
## @var{outprobest} is the matrix of the estimated output probabilities.
## @code{outprobest(i, j)} is the estimated probability of generating
## output @code{j} given state @code{i}.
## @end itemize
##
## If @code{'symbols'} is specified, then @var{sequence} is expected to be a
## sequence of the elements of @var{symbols} instead of integers.
## @var{symbols} can be a cell array.
##
## If @code{'statenames'} is specified, then @var{states} is expected to be
## a sequence of the elements of @var{statenames} instead of integers.
## @var{statenames} can be a cell array.
##
## If @code{'pseudotransitions'} is specified then the integer matrix
## @var{pseudotransitions} is used as an initial number of counted
## transitions. @code{pseudotransitions(i, j)} is the initial number of
## counted transitions from state @code{i} to state @code{j}.
## @var{transprobest} will have the same size as @var{pseudotransitions}.
## Use this if you have transitions that are very unlikely to occur.
##
## If @code{'pseudoemissions'} is specified then the integer matrix
## @var{pseudoemissions} is used as an initial number of counted outputs.
## @code{pseudoemissions(i, j)} is the initial number of counted outputs
## @code{j} given state @code{i}. If @code{'pseudoemissions'} is also
## specified then the number of rows of @var{pseudoemissions} must be the
## same as the number of rows of @var{pseudotransitions}. @var{outprobest}
## will have the same size as @var{pseudoemissions}. Use this if you have
## outputs or states that are very unlikely to occur.
##
## @subheading Examples
##
## @example
## @group
## transprob = [0.8, 0.2; 0.4, 0.6];
## outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1];
## [sequence, states] = hmmgenerate (25, transprob, outprob);
## [transprobest, outprobest] = hmmestimate (sequence, states)
## @end group
##
## @group
## symbols = @{'A', 'B', 'C'@};
## statenames = @{'One', 'Two'@};
## [sequence, states] = hmmgenerate (25, transprob, outprob,
##                      'symbols', symbols, 'statenames', statenames);
## [transprobest, outprobest] = hmmestimate (sequence, states,
##                              'symbols', symbols,
##                              'statenames', statenames)
## @end group
##
## @group
## pseudotransitions = [8, 2; 4, 6];
## pseudoemissions = [2, 4, 4; 7, 2, 1];
## [sequence, states] = hmmgenerate (25, transprob, outprob);
## [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions)
## @end group
## @end example
##
## @subheading References
##
## @enumerate
## @item
## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics
## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC,
## 2001.
##
## @item
## Lawrence R. Rabiner. A Tutorial on Hidden Markov Models and Selected
## Applications in Speech Recognition. @cite{Proceedings of the IEEE},
## 77(2), pages 257-286, February 1989.
## @end enumerate
## @end deftypefn

## Author: Arno Onken <asnelt@asnelt.org>
## Description: Estimation of a hidden Markov model for a given sequence

function [transprobest, outprobest] = hmmestimate (sequence, states, varargin)

  # Check arguments
  if (nargin < 2 || mod (length (varargin), 2) != 0)
    print_usage ();
  endif

  len = length (sequence);
  if (length (states) != len)
    error ("hmmestimate: sequence and states must have equal length");
  endif

  # Flag for symbols
  usesym = false;
  # Flag for statenames
  usesn = false;

  # Variables for return values
  transprobest = [];
  outprobest = [];

  # Process varargin
  for i = 1:2:length (varargin)
    # There must be an identifier: 'symbols', 'statenames',
    # 'pseudotransitions' or 'pseudoemissions'
    if (! ischar (varargin{i}))
      print_usage ();
    endif
    # Upper case is also fine
    lowerarg = lower (varargin{i});
    if (strcmp (lowerarg, 'symbols'))
      usesym = true;
      # Use the following argument as symbols
      symbols = varargin{i + 1};
    # The same for statenames
    elseif (strcmp (lowerarg, 'statenames'))
      usesn = true;
      # Use the following argument as statenames
      statenames = varargin{i + 1};
    elseif (strcmp (lowerarg, 'pseudotransitions'))
      # Use the following argument as an initial count for transitions
      transprobest = varargin{i + 1};
      if (! ismatrix (transprobest))
        error ("hmmestimate: pseudotransitions must be a non-empty numeric matrix");
      endif
      if (rows (transprobest) != columns (transprobest))
        error ("hmmestimate: pseudotransitions must be a square matrix");
      endif
    elseif (strcmp (lowerarg, 'pseudoemissions'))
      # Use the following argument as an initial count for outputs
      outprobest = varargin{i + 1};
      if (! ismatrix (outprobest))
        error ("hmmestimate: pseudoemissions must be a non-empty numeric matrix");
      endif
    else
      error ("hmmestimate: expected 'symbols', 'statenames', 'pseudotransitions' or 'pseudoemissions' but found '%s'", varargin{i});
    endif
  endfor

  # Transform sequence from symbols to integers if necessary
  if (usesym)
    # sequenceint is used to build the transformed sequence
    sequenceint = zeros (1, len);
    for i = 1:length (symbols)
      # Search for symbols(i) in the sequence, isequal will have 1 at
      # corresponding indices; i is the right integer for that symbol
      isequal = ismember (sequence, symbols(i));
      # We do not want to change sequenceint if the symbol appears a second
      # time in symbols
      if (any ((sequenceint == 0) & (isequal == 1)))
        isequal *= i;
        sequenceint += isequal;
      endif
    endfor
    if (! all (sequenceint))
      index = max ((sequenceint == 0) .* (1:len));
      error (["hmmestimate: sequence(" int2str (index) ") not in symbols"]);
    endif
    sequence = sequenceint;
  else
    if (! isvector (sequence))
      error ("hmmestimate: sequence must be a non-empty vector");
    endif
    if (! all (ismember (sequence, 1:max (sequence))))
      index = max ((ismember (sequence, 1:max (sequence)) == 0) .* (1:len));
      error (["hmmestimate: sequence(" int2str (index) ") not feasible"]);
    endif
  endif

  # Transform states from statenames to integers if necessary
  if (usesn)
    # statesint is used to build the transformed states
    statesint = zeros (1, len);
    for i = 1:length (statenames)
      # Search for statenames(i) in states, isequal will have 1 at
      # corresponding indices; i is the right integer for that statename
      isequal = ismember (states, statenames(i));
      # We do not want to change statesint if the statename appears a second
      # time in statenames
      if (any ((statesint == 0) & (isequal == 1)))
        isequal *= i;
        statesint += isequal;
      endif
    endfor
    if (! all (statesint))
      index = max ((statesint == 0) .* (1:len));
      error (["hmmestimate: states(" int2str (index) ") not in statenames"]);
    endif
    states = statesint;
  else
    if (! isvector (states))
      error ("hmmestimate: states must be a non-empty vector");
    endif
    if (! all (ismember (states, 1:max (states))))
      index = max ((ismember (states, 1:max (states)) == 0) .* (1:len));
      error (["hmmestimate: states(" int2str (index) ") not feasible"]);
    endif
  endif

  # Estimate the number of different states as the max of states  
  nstate = max (states);
  # Estimate the number of different outputs as the max of sequence
  noutput = max (sequence);

  # transprobest is empty if pseudotransitions is not specified
  if (isempty (transprobest))
    # outprobest is not empty if pseudoemissions is specified
    if (! isempty (outprobest))
      if (nstate > rows (outprobest))
        error ("hmmestimate: not enough rows in pseudoemissions");
      endif
      # The number of states is specified by pseudoemissions
      nstate = rows (outprobest);
    endif
    transprobest = zeros (nstate, nstate);
  else
    if (nstate > rows (transprobest))
      error ("hmmestimate: not enough rows in pseudotransitions");
    endif
    # The number of states is given by pseudotransitions
    nstate = rows (transprobest);
  endif

  # outprobest is empty if pseudoemissions is not specified
  if (isempty (outprobest))
    outprobest = zeros (nstate, noutput);
  else
    if (noutput > columns (outprobest))
      error ("hmmestimate: not enough columns in pseudoemissions");
    endif
    # Number of outputs is specified by pseudoemissions
    noutput = columns (outprobest);
    if (rows (outprobest) != nstate)
      error ("hmmestimate: pseudoemissions must have the same number of rows as pseudotransitions");
    endif
  endif

  # Assume that the model started in state 1
  cstate = 1;
  for i = 1:len
    # Count the number of transitions for each state pair
    transprobest(cstate, states(i)) ++;
    cstate = states (i);
    # Count the number of outputs for each state output pair
    outprobest(cstate, sequence(i)) ++;
  endfor

  # transprobest and outprobest contain counted numbers
  # Each row in transprobest and outprobest should contain estimated
  # probabilities
  # => scale so that the sum is 1
  # A zero row remains zero
  # - for transprobest
  s = sum (transprobest, 2);
  s(s == 0) = 1;
  transprobest = transprobest ./ (s * ones (1, nstate));
  # - for outprobest
  s = sum (outprobest, 2);
  s(s == 0) = 1;
  outprobest = outprobest ./ (s * ones (1, noutput));

endfunction

%!test
%! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3];
%! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1];
%! [transprobest, outprobest] = hmmestimate (sequence, states);
%! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429];
%! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000];
%! assert (transprobest, expectedtransprob, 0.001);
%! assert (outprobest, expectedoutprob, 0.001);

%!test
%! sequence = {'A', 'B', 'A', 'A', 'A', 'B', 'B', 'A', 'B', 'C', 'C', 'C', 'C', 'B', 'C', 'A', 'A', 'A', 'A', 'C', 'C', 'B', 'C', 'A', 'C'};
%! states = {'One', 'One', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'Two', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One'};
%! symbols = {'A', 'B', 'C'};
%! statenames = {'One', 'Two'};
%! [transprobest, outprobest] = hmmestimate (sequence, states, 'symbols', symbols, 'statenames', statenames);
%! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429];
%! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000];
%! assert (transprobest, expectedtransprob, 0.001);
%! assert (outprobest, expectedoutprob, 0.001);

%!test
%! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3];
%! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1];
%! pseudotransitions = [8, 2; 4, 6];
%! pseudoemissions = [2, 4, 4; 7, 2, 1];
%! [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions);
%! expectedtransprob = [0.85714, 0.14286; 0.35294, 0.64706];
%! expectedoutprob = [0.178571, 0.357143, 0.464286; 0.823529, 0.117647, 0.058824];
%! assert (transprobest, expectedtransprob, 0.001);
%! assert (outprobest, expectedoutprob, 0.001);
octave-statistics 1.3.0-1 / usr / share / octave / packages / statistics-1.3.0 / hmmestimate.m