Friday, April 22, 2016

N-gram

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

using namespace std;
// Padding tokens that mark sentence boundaries. Every sentence is framed by
// (n - 1) copies of START_SYMBOL and one END_SYMBOL, so the first real word
// and the end of the sentence each get a full n-token window.
const std::string START_SYMBOL = "#@#";
const std::string END_SYMBOL = "@!#@";

// Unsmoothed maximum-likelihood n-gram language model over space-separated
// words.
//
// The model estimates
//   P(w_i | w_{i-n+1} .. w_{i-1}) = count(n-gram) / count((n-1)-word context)
// from the training sentences. No smoothing is applied, so any sentence that
// contains an n-gram absent from the training data has probability zero.
class Ngram {
 public:
  // Counts every n-gram (and its (n-1)-word context) in `data`.
  //
  // data: training sentences, words separated by spaces.
  // n:    model order (1 = unigram, 2 = bigram, ...).
  // Throws std::invalid_argument if n < 1.
  Ngram(const std::vector<std::string>& data, int n) : order_(n) {
    if (n < 1) {
      throw std::invalid_argument("Ngram order must be at least 1");
    }
    for (const std::string& sentence : data) {
      const std::vector<std::string> tokens = pad(sentence);
      // pad() always yields at least n tokens, so `last` is never negative.
      const int last = static_cast<int>(tokens.size()) - n;
      for (int i = 0; i <= last; ++i) {
        ++ngrams[concat(tokens, i, i + n)];
        ++context_counts_[concat(tokens, i, i + n - 1)];
      }
    }
  }

  // Returns the natural-log probability of `sentence` under the model.
  //
  // The result is the sum of log conditional probabilities of each word
  // (and of the end-of-sentence marker) given its preceding n-1 tokens.
  // Returns negative infinity when the sentence contains an n-gram that was
  // never seen in training (probability zero).
  double LogProb(const std::string& sentence) const {
    const std::vector<std::string> tokens = pad(sentence);
    const int last = static_cast<int>(tokens.size()) - order_;
    double log_prob = 0.0;
    for (int i = 0; i <= last; ++i) {
      // find() rather than operator[]: lookups must not insert zero entries.
      const auto gram = ngrams.find(concat(tokens, i, i + order_));
      const auto context =
          context_counts_.find(concat(tokens, i, i + order_ - 1));
      if (gram == ngrams.end() || context == context_counts_.end()) {
        return -std::numeric_limits<double>::infinity();
      }
      log_prob += std::log(static_cast<double>(gram->second) /
                           static_cast<double>(context->second));
    }
    return log_prob;
  }

  // Splits `curr_string` on the space character and returns the words in
  // order. Leading, trailing and repeated spaces produce no empty words.
  static std::vector<std::string> split_string_by_space(
      const std::string& curr_string) {
    std::vector<std::string> words;
    std::size_t begin = curr_string.find_first_not_of(' ');
    while (begin != std::string::npos) {
      const std::size_t stop = curr_string.find(' ', begin);
      // When stop is npos, substr clamps the count to the end of the string.
      words.push_back(curr_string.substr(begin, stop - begin));
      begin = curr_string.find_first_not_of(' ', stop);
    }
    return words;
  }

  // Joins words[start, end) with single spaces. Indices outside the vector
  // are clamped, so an empty or inverted range yields an empty string.
  static std::string concat(const std::vector<std::string>& words, int start,
                            int end) {
    const int first = std::max(start, 0);
    const int limit = std::min(end, static_cast<int>(words.size()));
    std::string joined;
    for (int i = first; i < limit; ++i) {
      if (i > first) joined += ' ';
      joined += words[i];
    }
    return joined;
  }

  // n-gram key (tokens joined by single spaces) -> occurrence count.
  std::unordered_map<std::string, int> ngrams;

 private:
  // Tokenizes `sentence` and frames it with (order_ - 1) START_SYMBOL tokens
  // and one trailing END_SYMBOL token.
  std::vector<std::string> pad(const std::string& sentence) const {
    std::vector<std::string> tokens(static_cast<std::size_t>(order_ - 1),
                                    START_SYMBOL);
    for (std::string& word : split_string_by_space(sentence)) {
      tokens.push_back(std::move(word));
    }
    tokens.push_back(END_SYMBOL);
    return tokens;
  }

  // (n-1)-token context key -> number of n-grams that start with it.
  std::unordered_map<std::string, int> context_counts_;
  // Model order n.
  int order_;
};


#ifndef __main__
#define __main__
int main() {
  vector data = {
    "I am Sam",
    "Sam I am",
    "Sam likes green eggs and ham",
    "I like green eggs and pizza",
    "I do not like green eggs and ham",
    "I do not eat green eggs and ham",
  };

  Ngram ngram(data, 3);
  // should print the log (natural log) probability of the sentence
  cout << ngram.LogProb("I like green eggs and ham") << endl;
  return 0;
}
#endif

No comments: