// TFIDFMeasure.cs
/*
* tf/idf implementation
* Author: Thanh Dao, thanh.dao@gmx.net
*/
using System;
using System.Collections;
using System.Collections.Generic;
using WawaSoft.Search.Common;
namespace WawaSoft.Search.Common
{
    /// <summary>
    /// Builds a TF-IDF (term frequency / inverse document frequency) weight
    /// matrix for a fixed set of documents and exposes per-document term
    /// vectors and pairwise cosine similarity between documents.
    /// </summary>
    /// <remarks>
    /// All indexing work happens in the constructor. Documents are lower-cased
    /// (culture-invariant) before being handed to the tokeniser, both when the
    /// vocabulary is built and when term frequencies are counted, so the two
    /// passes always agree on what a term is.
    /// </remarks>
    public class TFIDFMeasure
    {
        private readonly string[] _docs;
        private readonly int _numDocs;
        private int _numTerms;
        private readonly ITokeniser _tokenizer;

        // Vocabulary in first-seen order; a term's position is its term index.
        private List<string> _terms;

        // Reverse lookup: term text -> term index in _terms.
        private readonly Dictionary<string, int> _wordsIndex = new Dictionary<string, int>();

        private int[][] _termFreq;      // [term][doc] raw occurrence count
        private float[][] _termWeight;  // [term][doc] tf * idf
        private int[] _maxTermFreq;     // [doc] highest raw count of any term in the document
        private int[] _docFreq;         // [term] number of documents containing the term

        /// <summary>
        /// Indexes <paramref name="documents"/> and computes all TF-IDF weights.
        /// </summary>
        /// <param name="documents">Documents to index. A null element is treated as an empty document.</param>
        /// <param name="tokeniser">Tokeniser used to split each document into terms.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="documents"/> or <paramref name="tokeniser"/> is null.
        /// </exception>
        public TFIDFMeasure(string[] documents, ITokeniser tokeniser)
        {
            if (documents == null)
            {
                throw new ArgumentNullException("documents");
            }
            if (tokeniser == null)
            {
                throw new ArgumentNullException("tokeniser");
            }

            _docs = documents;
            _numDocs = documents.Length;
            _tokenizer = tokeniser;
            MyInit();
        }

        /// <summary>
        /// Number of distinct terms in the vocabulary; also the length of the
        /// vectors returned by <see cref="GetTermVector2"/>.
        /// </summary>
        /// <remarks>
        /// The setter is kept for backward compatibility. The weight tables are
        /// sized from the value assigned during construction, so raising it
        /// afterwards makes the term-vector methods index past those tables.
        /// </remarks>
        public int NumTerms
        {
            get { return _numTerms; }
            set { _numTerms = value; }
        }

        /// <summary>
        /// Normalises a document and splits it into terms.
        /// </summary>
        /// <param name="doc">Document text; null is treated as empty.</param>
        /// <returns>The tokeniser output, never null.</returns>
        private IList<string> Tokenize(string doc)
        {
            if (doc == null)
            {
                return new List<string>();
            }

            // Invariant lower-casing keeps the index independent of the
            // current thread culture.
            IList<string> words = _tokenizer.Partition(doc.ToLowerInvariant());
            if (words == null)
            {
                return new List<string>();
            }
            return words;
        }

        /// <summary>
        /// Collects the distinct terms of all documents in first-seen order
        /// and registers each one in <c>_wordsIndex</c> under its position.
        /// </summary>
        /// <param name="docs">Documents to scan.</param>
        /// <returns>The ordered vocabulary.</returns>
        private List<string> GenerateTerms(string[] docs)
        {
            List<string> uniques = new List<string>();
            for (int i = 0; i < docs.Length; i++)
            {
                IList<string> words = Tokenize(docs[i]);
                for (int j = 0; j < words.Count; j++)
                {
                    string word = words[j];

                    // The dictionary doubles as the "already seen" set, which
                    // avoids a linear scan of the vocabulary per word.
                    if (word == null || _wordsIndex.ContainsKey(word))
                    {
                        continue;
                    }
                    _wordsIndex.Add(word, uniques.Count);
                    uniques.Add(word);
                }
            }
            return uniques;
        }

        /// <summary>
        /// Looks up the vocabulary index of <paramref name="term"/>.
        /// </summary>
        /// <returns>The term index, or -1 when the term is not in the vocabulary.</returns>
        private int GetTermIndex(string term)
        {
            int index;
            if (_wordsIndex.TryGetValue(term, out index))
            {
                return index;
            }
            return -1;
        }

        /// <summary>
        /// Builds the vocabulary, allocates the [term][doc] tables and fills
        /// in frequencies and weights.
        /// </summary>
        private void MyInit()
        {
            _terms = GenerateTerms(_docs);
            NumTerms = _terms.Count;

            _maxTermFreq = new int[_numDocs];
            _docFreq = new int[_numTerms];
            _termFreq = new int[_numTerms][];
            _termWeight = new float[_numTerms][];

            for (int i = 0; i < _numTerms; i++)
            {
                _termFreq[i] = new int[_numDocs];
                _termWeight[i] = new float[_numDocs];
            }

            GenerateTermFrequency();
            GenerateTermWeight();
        }

        /// <summary>
        /// Natural logarithm (base e) of <paramref name="num"/>.
        /// </summary>
        private float Log(float num)
        {
            return (float)Math.Log(num);
        }

        /// <summary>
        /// Fills <c>_termFreq</c>, <c>_docFreq</c> and <c>_maxTermFreq</c>
        /// from the per-document word counts.
        /// </summary>
        private void GenerateTermFrequency()
        {
            for (int i = 0; i < _numDocs; i++)
            {
                Dictionary<string, int> freq = GetWordFrequency(_docs[i]);
                int maxFreq = 0;

                foreach (KeyValuePair<string, int> entry in freq)
                {
                    int termIndex = GetTermIndex(entry.Key);
                    if (termIndex == -1)
                    {
                        continue;
                    }

                    _termFreq[termIndex][i] = entry.Value;
                    _docFreq[termIndex]++;
                    if (entry.Value > maxFreq)
                    {
                        maxFreq = entry.Value;
                    }
                }

                // Stays 0 for a document with no known terms; GetTermFrequency
                // treats that as "no term frequency" rather than dividing by it.
                _maxTermFreq[i] = maxFreq;
            }
        }

        /// <summary>
        /// Computes the tf * idf weight of every (term, document) pair.
        /// </summary>
        private void GenerateTermWeight()
        {
            for (int i = 0; i < _numTerms; i++)
            {
                for (int j = 0; j < _numDocs; j++)
                {
                    _termWeight[i][j] = ComputeTermWeight(i, j);
                }
            }
        }

        /// <summary>
        /// Term frequency of <paramref name="term"/> in <paramref name="doc"/>,
        /// normalised by the count of the most frequent term in that document.
        /// </summary>
        /// <returns>A value in [0, 1]; 0 when the document has no counted terms.</returns>
        private float GetTermFrequency(int term, int doc)
        {
            int maxfreq = _maxTermFreq[doc];
            if (maxfreq <= 0)
            {
                return 0f;
            }
            return (float)_termFreq[term][doc] / (float)maxfreq;
        }

        /// <summary>
        /// Inverse document frequency: ln(number of documents / documents containing the term).
        /// </summary>
        /// <returns>The idf, or 0 when no document contains the term.</returns>
        private float GetInverseDocumentFrequency(int term)
        {
            int df = _docFreq[term];
            if (df <= 0)
            {
                // Dividing by zero would give +Infinity, and 0 * Infinity in
                // ComputeTermWeight would then yield NaN.
                return 0f;
            }
            return Log((float)_numDocs / (float)df);
        }

        /// <summary>
        /// TF-IDF weight of <paramref name="term"/> in <paramref name="doc"/>.
        /// </summary>
        private float ComputeTermWeight(int term, int doc)
        {
            float tf = GetTermFrequency(term, doc);
            float idf = GetInverseDocumentFrequency(term);
            return tf * idf;
        }

        /// <summary>
        /// Copies the weights of document <paramref name="doc"/> into a new
        /// vector with one entry per term.
        /// </summary>
        private float[] GetTermVector(int doc)
        {
            float[] w = new float[NumTerms];
            for (int i = 0; i < NumTerms; i++)
            {
                w[i] = _termWeight[i][doc];
            }
            return w;
        }

        /// <summary>
        /// Returns the TF-IDF weight vector of a document as doubles.
        /// </summary>
        /// <param name="doc">Zero-based index of the document in the array passed to the constructor.</param>
        /// <returns>A new array of length <see cref="NumTerms"/>.</returns>
        public double[] GetTermVector2(int doc)
        {
            double[] ret = new double[NumTerms];
            float[] w = GetTermVector(doc);
            for (int i = 0; i < ret.Length; i++)
            {
                ret[i] = w[i];
            }
            return ret;
        }

        /// <summary>
        /// Cosine similarity between the TF-IDF vectors of two documents,
        /// as computed by <c>TermVector.ComputeCosineSimilarity</c>.
        /// </summary>
        /// <param name="doc_i">Zero-based index of the first document.</param>
        /// <param name="doc_j">Zero-based index of the second document.</param>
        public double GetSimilarity(int doc_i, int doc_j)
        {
            double[] vector1 = GetTermVector2(doc_i);
            double[] vector2 = GetTermVector2(doc_j);
            return TermVector.ComputeCosineSimilarity(vector1, vector2);
        }

        /// <summary>
        /// Counts how often each term occurs in <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Document text; null is treated as empty.</param>
        /// <returns>A map from term to its occurrence count in the document.</returns>
        private Dictionary<string, int> GetWordFrequency(string input)
        {
            Dictionary<string, int> result = new Dictionary<string, int>();
            IList<string> words = Tokenize(input);

            for (int i = 0; i < words.Count; i++)
            {
                string word = words[i];
                if (word == null)
                {
                    continue;
                }

                // TryGetValue leaves count at 0 for a word seen for the first time.
                int count;
                result.TryGetValue(word, out count);
                result[word] = count + 1;
            }
            return result;
        }
    }
}