I have Develop application for near duplicate detection in c#.This application work for strings only but it's not working for pdf files.may be i think GetSimilarity method can not work properly but error not be raised.
my application code like as:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using iTextSharp.text;
using System.Threading;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.Text.RegularExpressions;
using WindowsFormsApplication1.appcode;
namespace WindowsFormsApplication1
{
public partial class Form1 : Form
{
string filename;
FileInfo[] data1;
FileInfo[] data2;
string path;
public Form1()
{
InitializeComponent();
}
private void button1_Click(object sender, EventArgs e)
{
OpenFileDialog openFileDialog = new OpenFileDialog();
openFileDialog.CheckFileExists = true;
openFileDialog.AddExtension = true;
openFileDialog.Filter = "PDF files (*.pdf)|*.pdf";
DialogResult result = openFileDialog.ShowDialog();
if (result == DialogResult.OK)
{
filename = Path.GetFileName(openFileDialog.FileName);
path = Path.GetDirectoryName(openFileDialog.FileName);
textBox1.Text = path + "\\" + filename;
}
}
private void button2_Click(object sender, EventArgs e)
{
OpenFileDialog openFileDialog = new OpenFileDialog();
openFileDialog.CheckFileExists = true;
openFileDialog.AddExtension = true;
openFileDialog.Filter = "PDF files (*.pdf)|*.pdf";
DialogResult result = openFileDialog.ShowDialog();
if (result == DialogResult.OK)
{
filename = Path.GetFileName(openFileDialog.FileName);
path = Path.GetDirectoryName(openFileDialog.FileName);
textBox2.Text = path + "\\" + filename;
}
}
public static string ExtractTextFromPdf(string filename)
{
using (PdfReader r = new PdfReader(filename))
{
StringBuilder text = new StringBuilder();
for (int i = 1; i <= r.NumberOfPages; i++)
{
text.Append(PdfTextExtractor.GetTextFromPage(r, i));
}
string first = text.ToString();
return first;
}
}
public static string Extract(string filename)
{
using (PdfReader r = new PdfReader(filename))
{
StringBuilder text = new StringBuilder();
for (int i = 1; i <= r.NumberOfPages; i++)
{
text.Append(PdfTextExtractor.GetTextFromPage(r, i));
}
string second = text.ToString();
return second;
}
}
private void button3_Click(object sender, EventArgs e)
{
StopWordsHandler stopword = new StopWordsHandler();
string s = ExtractTextFromPdf(textBox1.Text);
string s1 = Extract(textBox2.Text);
string[] doc = new string[2]{s,s1 };
TFIDF tfidf = new TFIDF(doc);
float fl = tfidf.GetSimilarity(0,1);
var sformatted = string.Format("Value: {0:P2}.", fl);
}
StopWordsHandler.cs:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace WindowsFormsApplication1.appcode
{
class StopWordsHandler
{
public static string[] stopWordsList=new string[] {
"a",
"about",
"above",
"across",
"afore",
"aforesaid",
"after",
"again",
"against",
"agin",
"ago",
"aint",
"albeit",
"all",
"almost",
"alone",
"along",
"alongside",
"already",
"also",
"although",
"always",
"am",
"american",
"amid",
"amidst",
"among",
"amongst",
"an",
"and",
"anent",
"another",
"any",
"anybody",
"anyone",
"anything",
"are",
"aren't",
"around",
"as",
"aslant",
"astride",
"at",
"athwart",
"away",
"b",
"back",
"bar",
"barring",
"be",
"because",
"been",
"before",
"behind",
"being",
"below",
"beneath",
"beside",
"besides",
"best",
"better",
"between",
"betwixt",
"beyond",
"both",
"but",
"by",
"c",
"can",
"cannot",
"can't",
"certain",
"circa",
"close",
"concerning",
"considering",
"cos",
"could",
"couldn't",
"couldst",
"d",
"dare",
"dared",
"daren't",
"dares",
"daring",
"despite",
"did",
"didn't",
"different",
"directly",
"do",
"does",
"doesn't",
"doing",
"done",
"don't",
"dost",
"doth",
"down",
"during",
"durst",
"e",
"each",
"early",
"either",
"em",
"english",
"enough",
"ere",
"even",
"ever",
"every",
"everybody",
"everyone",
"everything",
"except",
"excepting",
"f",
"failing",
"far",
"few",
"first",
"five",
"following",
"for",
"four",
"from",
"g",
"gonna",
"gotta",
"h",
"had",
"hadn't",
"hard",
"has",
"hasn't",
"hast",
"hath",
"have",
"haven't",
"having",
"he",
"he'd",
"he'll",
"her",
"here",
"here's",
"hers",
"herself",
"he's",
"high",
"him",
"himself",
"his",
"home",
"how",
"howbeit",
"however",
"how's",
"i",
"id",
"if",
"ill",
"i'm",
"immediately",
"important",
"in",
"inside",
"instantly",
"into",
"is",
"isn't",
"it",
"it'll",
"it's",
"its",
"itself",
"i've",
"j",
"just",
"k",
"l",
"large",
"last",
"later",
"least",
"left",
"less",
"lest",
"let's",
"like",
"likewise",
"little",
"living",
"long",
"m",
"many",
"may",
"mayn't",
"me",
"mid",
"midst",
"might",
"mightn't",
"mine",
"minus",
"more",
"most",
"much",
"must",
"mustn't",
"my",
"myself",
"n",
"near",
"'neath",
"need",
"needed",
"needing",
"needn't",
"needs",
"neither",
"never",
"nevertheless",
"new",
"next",
"nigh",
"nigher",
"nighest",
"nisi",
"no",
"no-one",
"nobody",
"none",
"nor",
"not",
"nothing",
"notwithstanding",
"now",
"o",
"o'er",
"of",
"off",
"often",
"on",
"once",
"one",
"oneself",
"only",
"onto",
"open",
"or",
"other",
"otherwise",
"ought",
"oughtn't",
"our",
"ours",
"ourselves",
"out",
"outside",
"over",
"own",
"p",
"past",
"pending",
"per",
"perhaps",
"plus",
"possible",
"present",
"probably",
"provided",
"providing",
"public",
"q",
"qua",
"quite",
"r",
"rather",
"re",
"real",
"really",
"respecting",
"right",
"round",
"s",
"same",
"sans",
"save",
"saving",
"second",
"several",
"shall",
"shalt",
"shan't",
"she",
"shed",
"shell",
"she's",
"short",
"should",
"shouldn't",
"since",
"six",
"small",
"so",
"some",
"somebody",
"someone",
"something",
"sometimes",
"soon",
"special",
"still",
"such",
"summat",
"supposing",
"sure",
"t",
"than",
"that",
"that'd",
"that'll",
"that's",
"the",
"thee",
"their",
"theirs",
"their's",
"them",
"themselves",
"then",
"there",
"there's",
"these",
"they",
"they'd",
"they'll",
"they're",
"they've",
"thine",
"this",
"tho",
"those",
"thou",
"though",
"three",
"thro'",
"through",
"throughout",
"thru",
"thyself",
"till",
"to",
"today",
"together",
"too",
"touching",
"toward",
"towards",
"true",
"'twas",
"'tween",
"'twere",
"'twill",
"'twixt",
"two",
"'twould",
"u",
"under",
"underneath",
"unless",
"unlike",
"until",
"unto",
"up",
"upon",
"us",
"used",
"usually",
"v",
"versus",
"very",
"via",
"vice",
"vis-a-vis",
"w",
"wanna",
"wanting",
"was",
"wasn't",
"way",
"we",
"we'd",
"well",
"were",
"weren't",
"wert",
"we've",
"what",
"whatever",
"what'll",
"what's",
"when",
"whencesoever",
"whenever",
"when's",
"whereas",
"where's",
"whether",
"which",
"whichever",
"whichsoever",
"while",
"whilst",
"who",
"who'd",
"whoever",
"whole",
"who'll",
"whom",
"whore",
"who's",
"whose",
"whoso",
"whosoever",
"will",
"with",
"within",
"without",
"wont",
"would",
"wouldn't",
"wouldst",
"x",
"y",
"ye",
"yet",
"you",
"you'd",
"you'll",
"your",
"you're",
"yours",
"yourself",
"yourselves",
"you've",
"z",
} ;
private static Hashtable _stopwords=null;
public static object AddElement(IDictionary collection,Object key, object newValue)
{
object element = collection[key];
collection[key] = newValue;
return element;
}
public static bool IsStopword(string str)
{
return _stopwords.ContainsKey(str);
}
public StopWordsHandler()
{
if (_stopwords == null)
{
_stopwords = new Hashtable();
double dummy = 0;
foreach (string word in stopWordsList)
{
AddElement(_stopwords, word, dummy);
}
}
}
}
}
TFIDF.cs:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace WindowsFormsApplication1.appcode
{
class TFIDF
{
private string[] _docs;
private string[][] _ngramDoc;
private int _numDocs=0;
private int _numTerms=0;
private ArrayList _terms;
private int[][] _termFreq;
private float[][] _termWeight;
private int[] _maxTermFreq;
private int[] _docFreq;
public class TermVector
{
public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
{
if (vector1.Length != vector2.Length)
throw new Exception("DIFER LENGTH");
float denom=(VectorLength(vector1) * VectorLength(vector2));
if (denom == 0F)
return 0F;
else
return (InnerProduct(vector1, vector2) / denom);
}
public static float InnerProduct(float[] vector1, float[] vector2)
{
if (vector1.Length != vector2.Length)
throw new Exception("DIFFER LENGTH ARE NOT ALLOWED");
float result=0F;
for (int i=0; i < vector1.Length; i++)
result += vector1[i] * vector2[i];
return result;
}
public static float VectorLength(float[] vector)
{
float sum=0.0F;
for (int i=0; i < vector.Length; i++)
sum=sum + (vector[i] * vector[i]);
return (float)Math.Sqrt(sum);
}
}
private IDictionary _wordsIndex=new Hashtable() ;
public TFIDF(string[] documents)
{
_docs=documents;
_numDocs=documents.Length ;
MyInit();
}
private void GeneratNgramText()
{
}
private ArrayList GenerateTerms(string[] docs)
{
ArrayList uniques=new ArrayList() ;
_ngramDoc=new string[_numDocs][] ;
for (int i=0; i < docs.Length ; i++)
{
Tokeniser tokenizer=new Tokeniser() ;
string[] words=tokenizer.Partition(docs[i]);
for (int j=0; j < words.Length ; j++)
if (!uniques.Contains(words[j]) )
uniques.Add(words[j]) ;
}
return uniques;
}
private static object AddElement(IDictionary collection, object key, object newValue)
{
object element=collection[key];
collection[key]=newValue;
return element;
}
private int GetTermIndex(string term)
{
object index=_wordsIndex[term];
if (index == null) return -1;
return (int) index;
}
private void MyInit()
{
_terms=GenerateTerms (_docs );
_numTerms=_terms.Count ;
_maxTermFreq=new int[_numDocs] ;
_docFreq=new int[_numTerms] ;
_termFreq =new int[_numTerms][] ;
_termWeight=new float[_numTerms][] ;
for(int i=0; i < _terms.Count ; i++)
{
_termWeight[i]=new float[_numDocs] ;
_termFreq[i]=new int[_numDocs] ;
AddElement(_wordsIndex, _terms[i], i);
}
GenerateTermFrequency ();
GenerateTermWeight();
}
private float Log(float num)
{
return (float) Math.Log(num) ;
}
private void GenerateTermFrequency()
{
for(int i=0; i < _numDocs ; i++)
{
string curDoc=_docs[i];
IDictionary freq=GetWordFrequency(curDoc);
IDictionaryEnumerator enums=freq.GetEnumerator() ;
_maxTermFreq[i]=int.MinValue ;
while (enums.MoveNext())
{
string word=(string)enums.Key;
int wordFreq=(int)enums.Value ;
int termIndex=GetTermIndex(word);
_termFreq [termIndex][i]=wordFreq;
_docFreq[termIndex] ++;
if (wordFreq > _maxTermFreq[i]) _maxTermFreq[i]=wordFreq;
}
}
}
private void GenerateTermWeight()
{
for(int i=0; i < _numTerms ; i++)
{
for(int j=0; j < _numDocs ; j++)
_termWeight[i][j]=ComputeTermWeight (i, j);
}
}
private float GetTermFrequency(int term, int doc)
{
int freq=_termFreq [term][doc];
int maxfreq=_maxTermFreq[doc];
return ( (float) freq/(float)maxfreq );
}
private float GetInverseDocumentFrequency(int term)
{
int df=_docFreq[term];
return Log((float) (_numDocs) / (float) df );
}
private float ComputeTermWeight(int term, int doc)
{
float tf=GetTermFrequency (term, doc);
float idf=GetInverseDocumentFrequency(term);
return tf * idf;
}
private float[] GetTermVector(int doc)
{
float[] w=new float[_numTerms] ;
for (int i=0; i < _numTerms; i++)
w[i]=_termWeight[i][doc];
return w;
}
public float GetSimilarity(int doc_i, int doc_j)
{
float[] vector1=GetTermVector (doc_i);
float[] vector2=GetTermVector (doc_j);
return TermVector.ComputeCosineSimilarity(vector1, vector2) ;
}
private IDictionary GetWordFrequency(string input)
{
Tokeniser tokenizer=new Tokeniser() ;
String[] words=tokenizer.Partition(input);
Array.Sort(words);
String[] distinctWords=GetDistinctWords(words);
IDictionary result=new Hashtable();
for (int i=0; i < distinctWords.Length; i++)
{
object tmp;
tmp=CountWords(distinctWords[i], words);
result[distinctWords[i]]=tmp;
}
return result;
}
private string[] GetDistinctWords(String[] input)
{
if (input == null)
return new string[0];
else
{
ArrayList list=new ArrayList() ;
for (int i=0; i < input.Length; i++)
if (!list.Contains(input[i]))
list.Add(input[i]);
return Tokeniser.ArrayListToArray(list) ;
}
}
private int CountWords(string word, string[] words)
{
int itemIdx=Array.BinarySearch(words, word);
if (itemIdx > 0)
while (itemIdx > 0 && words[itemIdx].Equals(word))
itemIdx--;
int count=0;
while (itemIdx < words.Length && itemIdx >= 0)
{
if (words[itemIdx].Equals(word)) count++;
itemIdx++;
if (itemIdx < words.Length)
if (!words[itemIdx].Equals(word)) break;
}
return count;
}
}
}
Tokeniser.cs:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Collections;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
namespace WindowsFormsApplication1.appcode
{
class Tokeniser
{
public static string[] ArrayListToArray(ArrayList arraylist)
{
string[] array = new string[arraylist.Count];
for (int i = 0; i < arraylist.Count; i++) array[i] = (string)arraylist[i];
return array;
}
public string[] Partition(string input)
{
Regex r = new Regex("([ \\t{}():;. \n])");
String[] tokens = r.Split(input);
ArrayList filter = new ArrayList();
for (int i = 0; i < tokens.Length; i++)
{
MatchCollection mc = r.Matches(tokens[i]);
if (mc.Count <= 0 && tokens[i].Trim().Length > 0
&& !StopWordsHandler.IsStopword(tokens[i]))
filter.Add(tokens[i]);
}
return ArrayListToArray(filter);
}
public Tokeniser()
{
}
}
}
button3 is compare functionality.in this scope i have to write similarity logic in terms of percentage.
oncle please check the code for similarty b/w the two pdf files.if any probelm please intimate.please help me.
thank u.