/***************************************************************************
 *   Project: GraphBlast                                                   *
 *   File: MinHashing class, Signature class                               *
 *   Date: 04-May-2005                                                     *
 ***************************************************************************/

#ifndef MINHASH_H
#define MINHASH_H

#include "common.h"
#include "primes.h"

// Both objects are only declared here; their definitions are not in this header.
// m_prime: copied into MinHashing::M, the modulus applied to every hash value.
extern unsigned long m_prime;
// primes: table of N_PRIMES values that MinHashing copies to use as hash seeds.
extern unsigned long primes[N_PRIMES];

/////////////////////////////////////////////////////////////////////////
//  Signature                                                          //
/////////////////////////////////////////////////////////////////////////

/**
 * Min-hash signature attached to a pattern string.
 *
 * Stores two parallel arrays of equal size: `sig` (accessed through
 * operator[]) and `inner_sup` (accessed through get_value()/set_value()).
 * Both are sized and filled by init(). The class also carries the pattern
 * string and a graph count that the caller sets with set_n_graphs().
 */
class Signature
{
   std::string pattern;
   int length;                 // number of entries requested in the last init() call
   std::vector<int> sig;       // classic signature
   std::vector<int> inner_sup; // take into account also inner support
   int n_gr;                   // value stored by set_n_graphs()
public:
   // Declared only; no definition is provided in this header.
   Signature();

   // Both pattern constructors start with an empty signature and zeroed
   // counters so that n_graphs() and the comparison friends never read
   // indeterminate values before init()/set_n_graphs() are called.
   explicit Signature(const std::string& pattern)
      : pattern(pattern), length(0), n_gr(0) {}
   // A null pointer is treated as an empty pattern.
   explicit Signature(const char* pattern)
      : pattern(pattern ? pattern : ""), length(0), n_gr(0) {}

   // Returns a copy of the stored pattern string.
   std::string get_pattern() const {return pattern;}
   // Returns the current number of signature entries.
   int get_length() const {return static_cast<int>(sig.size());}

   //Type TSignature must have this public members
   // Resizes both arrays to `length` entries: every sig entry becomes `value`
   // and every inner_sup entry becomes 0.
   void init(int length, int value)
   {
      this->length=length;
      sig.assign(length,value);
      inner_sup.assign(length,0);
   }
   // Inner-support accessors; `i` is not range-checked.
   int get_value(int i) const {return inner_sup[i];}
   void set_value(int i, int value){inner_sup[i]=value;}
   // Mutable access to the idx-th signature entry; `idx` is not range-checked.
   int& operator[](const int& idx){return sig[idx];}
   //End of TSignature

   void set_n_graphs(int ng){n_gr=ng;}
   int n_graphs() const {return n_gr;}

   // Renders every entry as " <sig>(<inner_sup>)", concatenated in index order.
   std::string to_string() const
   {
      std::string s;
      // Two decimal ints plus the three separator characters stay far below
      // 100 characters, so the fixed buffer cannot overflow.
      char buf[100];

      for(std::vector<int>::size_type k=0; k<sig.size(); ++k)
      {
        sprintf(buf," %d(%d)",sig[k],inner_sup[k]);
        s+=buf;
      }
      return s;
   }

   friend bool operator==(Signature& s1, Signature& s2);
   friend bool operator!=(Signature& s1, Signature& s2);
   friend float Similarity(Signature& s1, Signature& s2);
};

/**
 * Two signatures are equal when their `sig` arrays and their `inner_sup`
 * arrays match element for element. The pattern string and the graph count
 * are not part of the comparison.
 *
 * Declared inline because the definition lives in a header: without it every
 * translation unit that includes this file would emit its own definition and
 * linking more than one of them would fail.
 */
inline bool operator==(Signature& s1, Signature& s2)
{
  // Vector comparison checks the sizes first and stops at the first mismatch;
  // it also avoids relying on the separately stored `length` member.
  return s1.sig == s2.sig && s1.inner_sup == s2.inner_sup;
}

/**
 * Negation of operator== for signatures.
 *
 * Declared inline because the definition lives in a header, so including the
 * header from several translation units does not produce duplicate symbols.
 */
inline bool operator!=(Signature& sig1, Signature& sig2)
{
   return !(sig1==sig2);
}

/**
 * Similarity score of two signatures.
 *
 * Counts the positions (over the shorter of the two signatures) where the
 * `sig` entries agree and returns
 *     matches / (len1 + len2 - matches).
 * Only `sig` is inspected; `inner_sup` plays no role.
 *
 * NOTE(review): when both signatures are empty the expression is 0/0, which
 * presumably yields NaN on IEEE-754 platforms - confirm callers never pass
 * two uninitialised signatures.
 *
 * Declared inline because the definition lives in a header, so including the
 * header from several translation units does not produce duplicate symbols.
 */
inline float Similarity(Signature& s1, Signature& s2)
{
   // Take the sizes from the arrays that are actually indexed below.
   const int len1 = static_cast<int>(s1.sig.size());
   const int len2 = static_cast<int>(s2.sig.size());
   const int shared = len1 < len2 ? len1 : len2; // hoisted loop bound

   int count = 0;
   for(int i = 0; i < shared; ++i)
     if(s1.sig[i] == s2.sig[i]) ++count;

   return static_cast<float>(count)/(len1 + len2 - count);
}

/////////////////////////////////////////////////////////////////////////
//  Min Hashing                                                        //
/////////////////////////////////////////////////////////////////////////
/**
 * Builds min-hash signatures using K hash functions.
 *
 * Hash function i maps a row index r to ((r + 1) * seed[i] + 12345) mod M.
 * For each hash function the signature keeps the smallest hash value seen so
 * far, together with the value that was passed in with the row producing it.
 *
 * TSignature must provide init(int,int), set_value(int,int) and an
 * operator[](int) that returns an assignable, comparable reference
 * (see Signature).
 *
 * NOTE(review): seed holds N_PRIMES entries and Hash() indexes it with
 * 0..K-1, so K must not exceed N_PRIMES; the constructor does not check this.
 */
template<typename TSignature>
class MinHashing
{
   int K; // number of permutations
   int M; // base for hashing (it works much better if it is prime)
   std::vector<int> seed; // seeds for hashing (must be primes if we don't want equal hashes)

public:
   // Copies the modulus and the prime table from the globals declared at the
   // top of this header, then shuffles the seeds.
   explicit MinHashing(int K):K(K)
   {
      M = static_cast<int>(m_prime);
      for(int i=0;i<N_PRIMES;i++)
         seed.push_back(static_cast<int>(primes[i]));
      // shuffled because part of the numbers in primes.h are generated with get_next_prime()
      std::random_shuffle(seed.begin(),seed.end());
   }

   // Prepares tsig with K entries, each starting at M+1, which is larger
   // than any value do_hash() can return.
   void InitSignature(TSignature& tsig)
   {
      tsig.init(K,M+1); //do_hash() returns value in range 0..M-1
   }

   // Feeds one row into the signature: for every hash function whose hash of
   // `row` is smaller than the stored minimum, stores the new minimum and
   // records `value` next to it.
   void Hash(TSignature& tsig, int row, int value)
   {
      for(int i=0; i<K; i++)
      {
         int h=do_hash(i,row);
         if(h<tsig[i])
         {
            tsig[i]=h;
            tsig.set_value(i,value);
         }
      }
   }

private:
   // POSIX recommendation (but as always is based on Knuth)
   // Computes ((value + 1) * seed[k] + 12345) mod M.
   inline int do_hash(int k,int value)
   {
      // Do the arithmetic in unsigned long: the int product can overflow for
      // large rows or seeds, which is undefined behaviour for signed types.
      // Whenever the int computation would not have overflowed, the result
      // is unchanged.
      const unsigned long multiplier = static_cast<unsigned long>(seed[k]);
      const unsigned long h =
         (static_cast<unsigned long>(value) + 1UL) * multiplier + 12345UL;
      return static_cast<int>(h % static_cast<unsigned long>(M));
   }
};

#endif
