problem in near duplicate detection

Question

1.00/5 (1 vote)

See more:

I have Develop application for near duplicate detection in c#.This application work for strings only but it's not working for pdf files.may be i think GetSimilarity method can not work properly but error not be raised.
my application code like as:

C#

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using iTextSharp.text;
using System.Threading;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.Text.RegularExpressions;
using WindowsFormsApplication1.appcode;

namespace WindowsFormsApplication1
{
    public partial class Form1 : Form
    {
        string filename;
        FileInfo[] data1;
        FileInfo[] data2;
        string path;
        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            OpenFileDialog openFileDialog = new OpenFileDialog();
            openFileDialog.CheckFileExists = true;
            openFileDialog.AddExtension = true;
            openFileDialog.Filter = "PDF files (*.pdf)|*.pdf";
            DialogResult result = openFileDialog.ShowDialog();
            if (result == DialogResult.OK)
            {
                filename = Path.GetFileName(openFileDialog.FileName);
                path = Path.GetDirectoryName(openFileDialog.FileName);
                textBox1.Text = path + "\\" + filename;
               
            }
        }

        private void button2_Click(object sender, EventArgs e)
        {

            OpenFileDialog openFileDialog = new OpenFileDialog();
            openFileDialog.CheckFileExists = true;
            openFileDialog.AddExtension = true;
            openFileDialog.Filter = "PDF files (*.pdf)|*.pdf";
            DialogResult result = openFileDialog.ShowDialog();
            if (result == DialogResult.OK)
            {
                filename = Path.GetFileName(openFileDialog.FileName);
                path = Path.GetDirectoryName(openFileDialog.FileName);
                textBox2.Text = path + "\\" + filename;

            }
        }

        public static string ExtractTextFromPdf(string filename)
        {
            using (PdfReader r = new PdfReader(filename))
            {
                StringBuilder text = new StringBuilder();

                for (int i = 1; i <= r.NumberOfPages; i++)
                {
                    text.Append(PdfTextExtractor.GetTextFromPage(r, i));
                }

                string first = text.ToString();
                return first;
            }
        }
        public static string Extract(string filename)
        {
            using (PdfReader r = new PdfReader(filename))
            {
                StringBuilder text = new StringBuilder();

                for (int i = 1; i <= r.NumberOfPages; i++)
                {
                    text.Append(PdfTextExtractor.GetTextFromPage(r, i));
                }

                string second = text.ToString();
                return second;
            }
        }

       private void button3_Click(object sender, EventArgs e)
        {
            StopWordsHandler stopword = new StopWordsHandler();
            string s = ExtractTextFromPdf(textBox1.Text);
            string s1 = Extract(textBox2.Text);
            string[] doc = new string[2]{s,s1 };
            TFIDF tfidf = new TFIDF(doc);
            float fl = tfidf.GetSimilarity(0,1);
            var sformatted = string.Format("Value: {0:P2}.", fl);
           
        }

StopWordsHandler.cs:

C#

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace WindowsFormsApplication1.appcode
{
    class StopWordsHandler
    {
        public static string[] stopWordsList=new string[] {

															  "a", 																													  
															  "about", 
															  "above", 
															  "across", 
															  "afore", 
															  "aforesaid",
                                                              "after", 
                                                              "again", 
                                                              "against", 
                                                              "agin", 
                                                              "ago", 
                                                              "aint", 
                                                              "albeit", 
                                                              "all", 
                                                              "almost", 
                                                              "alone", 
                                                              "along", 
                                                              "alongside", 
                                                              "already", 
                                                              "also", 
                                                              "although", 
                                                              "always", 
                                                              "am", 
                                                              "american", 
                                                              "amid", 
                                                              "amidst", 
                                                              "among", 
                                                              "amongst", 
                                                              "an", 
                                                              "and", 
                                                              "anent", 
                                                              "another", 
                                                              "any", 
                                                              "anybody", 
                                                              "anyone", 
                                                              "anything", 
                                                              "are", 
                                                              "aren't", 
                                                              "around", 
                                                              "as", 
                                                              "aslant", 
                                                              "astride", 
                                                              "at", 
                                                              "athwart", 
                                                              "away", 
                                                              "b", 
                                                              "back", 
                                                              "bar", 
                                                              "barring", 
                                                              "be", 
                                                              "because", 
                                                              "been", 
                                                              "before", 
                                                              "behind", 
                                                              "being", 
                                                              "below", 
                                                              "beneath", 
                                                              "beside", 
                                                              "besides", 
                                                              "best", 
                                                              "better", 
                                                              "between", 
                                                              "betwixt", 
                                                              "beyond", 
                                                              "both", 
                                                              "but", 
                                                              "by", 
                                                              "c", 
                                                              "can", 
                                                              "cannot", 
                                                              "can't", 
                                                              "certain", 
                                                              "circa", 
                                                              "close", 
                                                              "concerning", 
                                                              "considering", 
                                                              "cos", 
                                                              "could", 
                                                              "couldn't", 
                                                              "couldst", 
                                                              "d", 
                                                              "dare", 
                                                              "dared", 
                                                              "daren't", 
                                                              "dares", 
                                                              "daring", 
                                                              "despite", 
                                                              "did", 
                                                              "didn't", 
                                                              "different", 
                                                              "directly", 
                                                              "do", 
                                                              "does", 
                                                              "doesn't", 
                                                              "doing", 
                                                              "done", 
                                                              "don't", 
                                                              "dost", 
                                                              "doth", 
                                                              "down", 
                                                              "during", 
                                                              "durst", 
                                                              "e", 
                                                              "each", 
                                                              "early", 
                                                              "either", 
                                                              "em", 
                                                              "english", 
                                                              "enough", 
                                                              "ere", 
                                                              "even", 
                                                              "ever", 
                                                              "every", 
                                                              "everybody", 
                                                              "everyone", 
                                                              "everything", 
                                                              "except", 
                                                              "excepting", 
                                                              "f", 
                                                              "failing", 
                                                              "far", 
                                                              "few", 
                                                              "first", 
                                                              "five", 
                                                              "following", 
                                                              "for", 
                                                              "four", 
                                                              "from", 
                                                              "g", 
                                                              "gonna", 
                                                              "gotta", 
                                                              "h", 
                                                              "had", 
                                                              "hadn't", 
                                                              "hard", 
                                                              "has", 
                                                              "hasn't", 
                                                              "hast", 
                                                              "hath", 
                                                              "have", 
                                                              "haven't", 
                                                              "having", 
                                                              "he", 
                                                              "he'd", 
                                                              "he'll", 
                                                              "her", 
                                                              "here", 
                                                              "here's", 
                                                              "hers", 
                                                              "herself", 
                                                              "he's", 
                                                              "high", 
                                                              "him", 
                                                              "himself", 
                                                              "his", 
                                                              "home", 
                                                              "how", 
                                                              "howbeit", 
                                                              "however", 
                                                              "how's", 
                                                              "i", 
                                                              "id", 
                                                              "if", 
                                                              "ill", 
                                                              "i'm", 
                                                              "immediately", 
                                                              "important", 
                                                              "in", 
                                                              "inside", 
                                                              "instantly", 
                                                              "into", 
                                                              "is", 
                                                              "isn't", 
                                                              "it", 
                                                              "it'll", 
                                                              "it's", 
                                                              "its", 
                                                              "itself", 
                                                              "i've", 
                                                              "j", 
                                                              "just", 
                                                              "k", 
                                                              "l", 
                                                              "large", 
                                                              "last", 
                                                              "later", 
                                                              "least", 
                                                              "left", 
                                                              "less", 
                                                              "lest", 
                                                              "let's", 
                                                              "like", 
                                                              "likewise", 
                                                              "little", 
                                                              "living", 
                                                              "long", 
                                                              "m", 
                                                              "many", 
                                                              "may", 
                                                              "mayn't", 
                                                              "me", 
                                                              "mid", 
                                                              "midst", 
                                                              "might", 
                                                              "mightn't", 
                                                              "mine", 
                                                              "minus", 
                                                              "more", 
                                                              "most", 
                                                              "much", 
                                                              "must", 
                                                              "mustn't", 
                                                              "my", 
                                                              "myself", 
                                                              "n", 
                                                              "near", 
                                                              "'neath", 
                                                              "need", 
                                                              "needed", 
                                                              "needing", 
                                                              "needn't", 
                                                              "needs", 
                                                              "neither", 
                                                              "never", 
                                                              "nevertheless", 
                                                              "new", 
                                                              "next", 
                                                              "nigh", 
                                                              "nigher", 
                                                              "nighest", 
                                                              "nisi", 
                                                              "no", 
                                                              "no-one", 
                                                              "nobody", 
                                                              "none", 
                                                              "nor", 
                                                              "not", 
                                                              "nothing", 
                                                              "notwithstanding", 
                                                              "now", 
                                                              "o", 
                                                              "o'er", 
                                                              "of", 
                                                              "off", 
                                                              "often", 
                                                              "on", 
                                                              "once", 
                                                              "one", 
                                                              "oneself", 
                                                              "only", 
                                                              "onto", 
                                                              "open", 
                                                              "or", 
                                                              "other", 
                                                              "otherwise", 
                                                              "ought", 
                                                              "oughtn't", 
                                                              "our", 
                                                              "ours", 
                                                              "ourselves", 
                                                              "out", 
                                                              "outside", 
                                                              "over", 
                                                              "own", 
                                                              "p", 
                                                              "past", 
                                                              "pending", 
                                                              "per", 
                                                              "perhaps", 
                                                              "plus", 
                                                              "possible", 
                                                              "present", 
                                                              "probably", 
                                                              "provided", 
                                                              "providing", 
                                                              "public", 
                                                              "q", 
                                                              "qua", 
                                                              "quite", 
                                                              "r", 
                                                              "rather", 
                                                              "re", 
                                                              "real", 
                                                              "really", 
                                                              "respecting", 
                                                              "right", 
                                                              "round", 
                                                              "s", 
                                                              "same", 
                                                              "sans", 
                                                              "save", 
                                                              "saving", 
                                                              "second", 
                                                              "several", 
                                                              "shall", 
                                                              "shalt", 
                                                              "shan't", 
                                                              "she", 
                                                              "shed", 
                                                              "shell", 
                                                              "she's", 
                                                              "short", 
                                                              "should", 
                                                              "shouldn't", 
                                                              "since", 
                                                              "six", 
                                                              "small", 
                                                              "so", 
                                                              "some", 
                                                              "somebody", 
                                                              "someone", 
                                                              "something", 
                                                              "sometimes", 
                                                              "soon", 
                                                              "special", 
                                                              "still", 
                                                              "such", 
                                                              "summat", 
                                                              "supposing", 
                                                              "sure", 
                                                              "t", 
                                                              "than", 
                                                              "that", 
                                                              "that'd", 
                                                              "that'll", 
                                                              "that's", 
                                                              "the", 
                                                              "thee", 
                                                              "their", 
                                                              "theirs", 
                                                              "their's", 
                                                              "them", 
                                                              "themselves", 
                                                              "then", 
                                                              "there", 
                                                              "there's", 
                                                              "these", 
                                                              "they", 
                                                              "they'd", 
                                                              "they'll", 
                                                              "they're", 
                                                              "they've", 
                                                              "thine", 
                                                              "this", 
                                                              "tho", 
                                                              "those", 
                                                              "thou", 
                                                              "though", 
                                                              "three", 
                                                              "thro'", 
                                                              "through", 
                                                              "throughout", 
                                                              "thru", 
                                                              "thyself", 
                                                              "till", 
                                                              "to", 
                                                              "today", 
                                                              "together", 
                                                              "too", 
                                                              "touching", 
                                                              "toward", 
                                                              "towards", 
                                                              "true", 
                                                              "'twas", 
                                                              "'tween", 
                                                              "'twere", 
                                                              "'twill", 
                                                              "'twixt", 
                                                              "two", 
                                                              "'twould", 
                                                              "u", 
                                                              "under", 
                                                              "underneath", 
                                                              "unless", 
                                                              "unlike", 
                                                              "until", 
                                                              "unto", 
                                                              "up", 
                                                              "upon", 
                                                              "us", 
                                                              "used", 
                                                              "usually", 
                                                              "v", 
                                                              "versus", 
                                                              "very", 
                                                              "via", 
                                                              "vice", 
                                                              "vis-a-vis", 
                                                              "w", 
                                                              "wanna", 
                                                              "wanting", 
                                                              "was", 
                                                              "wasn't", 
                                                              "way", 
                                                              "we", 
                                                              "we'd", 
                                                              "well", 
                                                              "were", 
                                                              "weren't", 
                                                              "wert", 
                                                              "we've", 
                                                              "what", 
                                                              "whatever", 
                                                              "what'll", 
                                                              "what's", 
                                                              "when", 
                                                              "whencesoever", 
                                                              "whenever", 
                                                              "when's", 
                                                              "whereas", 
                                                              "where's", 
                                                              "whether", 
                                                              "which", 
                                                              "whichever", 
                                                              "whichsoever", 
                                                              "while", 
                                                              "whilst", 
                                                              "who", 
                                                              "who'd", 
                                                              "whoever", 
                                                              "whole", 
                                                              "who'll", 
                                                              "whom", 
                                                              "whore", 
                                                              "who's", 
                                                              "whose", 
                                                              "whoso", 
                                                              "whosoever", 
                                                              "will", 
                                                              "with", 
                                                              "within", 
                                                              "without", 
                                                              "wont", 
                                                              "would", 
                                                              "wouldn't", 
                                                              "wouldst", 
                                                              "x", 
                                                              "y", 
                                                              "ye", 
                                                              "yet", 
                                                              "you", 
                                                              "you'd", 
                                                              "you'll", 
                                                              "your", 
                                                              "you're", 
                                                              "yours", 
                                                              "yourself", 
                                                              "yourselves", 
                                                              "you've", 
                                                              "z", 
        } ;

		private static Hashtable _stopwords=null;

		public static object AddElement(IDictionary collection,Object key, object newValue)
		{
			object element = collection[key];
			collection[key] = newValue;
			return element;
		}

        public static bool IsStopword(string str)
        {

            //int index=Array.BinarySearch(stopWordsList, str)
            return _stopwords.ContainsKey(str);
        }
	

		public StopWordsHandler()
		{
			if (_stopwords == null)
			{
				_stopwords = new Hashtable();
				double dummy = 0;
				foreach (string word in stopWordsList)
				{
					AddElement(_stopwords, word, dummy);
				}
			}
		}
	}
    }

TFIDF.cs:

C#

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace WindowsFormsApplication1.appcode
{
    class TFIDF
    {
        private string[] _docs;
		private string[][] _ngramDoc;
		private int _numDocs=0;
		private int _numTerms=0;
		private ArrayList _terms;
		private int[][] _termFreq;
		private float[][] _termWeight;
		private int[] _maxTermFreq;
		private int[] _docFreq;


		public class TermVector
		{		
			public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
			{
				if (vector1.Length != vector2.Length)				
					throw new Exception("DIFER LENGTH");
				

				float denom=(VectorLength(vector1) * VectorLength(vector2));
				if (denom == 0F)				
					return 0F;				
				else				
					return (InnerProduct(vector1, vector2) / denom);
				
			}

			public static float InnerProduct(float[] vector1, float[] vector2)
			{
			
				if (vector1.Length != vector2.Length)
					throw new Exception("DIFFER LENGTH ARE NOT ALLOWED");
				
			
				float result=0F;
				for (int i=0; i < vector1.Length; i++)				
					result += vector1[i] * vector2[i];
				
				return result;
			}
		
			public static float VectorLength(float[] vector)
			{			
				float sum=0.0F;
				for (int i=0; i < vector.Length; i++)				
					sum=sum + (vector[i] * vector[i]);
						
				return (float)Math.Sqrt(sum);
			}

		}

		private IDictionary _wordsIndex=new Hashtable() ;

		public TFIDF(string[] documents)
		{
			_docs=documents;
			_numDocs=documents.Length ;
			MyInit();
		}

		private void GeneratNgramText()
		{
			
		}

		private ArrayList GenerateTerms(string[] docs)
		{
			ArrayList uniques=new ArrayList() ;
			_ngramDoc=new string[_numDocs][] ;
			for (int i=0; i < docs.Length ; i++)
			{
				
                Tokeniser tokenizer=new Tokeniser() ;
				string[] words=tokenizer.Partition(docs[i]);			

				for (int j=0; j < words.Length ; j++)
					if (!uniques.Contains(words[j]) )				
						uniques.Add(words[j]) ;
								
			}
			return uniques;
		}
		


		private static object AddElement(IDictionary collection, object key, object newValue)
		{
			object element=collection[key];
			collection[key]=newValue;
			return element;
		}

		private int GetTermIndex(string term)
		{
			object index=_wordsIndex[term];
			if (index == null) return -1;
			return (int) index;
		}

		private void MyInit()
		{
			_terms=GenerateTerms (_docs );
			_numTerms=_terms.Count ;

			_maxTermFreq=new int[_numDocs] ;
			_docFreq=new int[_numTerms] ;
			_termFreq =new int[_numTerms][] ;
			_termWeight=new float[_numTerms][] ;

			for(int i=0; i < _terms.Count ; i++)			
			{
				_termWeight[i]=new float[_numDocs] ;
				_termFreq[i]=new int[_numDocs] ;

				AddElement(_wordsIndex, _terms[i], i);			
			}
			
			GenerateTermFrequency ();
			GenerateTermWeight();			
				
		}
		
		private float Log(float num)
		{
			return (float) Math.Log(num) ;//log2
		}

		private void GenerateTermFrequency()
		{
			for(int i=0; i < _numDocs  ; i++)
			{								
				string curDoc=_docs[i];
				IDictionary freq=GetWordFrequency(curDoc);
				IDictionaryEnumerator enums=freq.GetEnumerator() ;
				_maxTermFreq[i]=int.MinValue ;
				while (enums.MoveNext())
				{
					string word=(string)enums.Key;
					int wordFreq=(int)enums.Value ;
					int termIndex=GetTermIndex(word);

					_termFreq [termIndex][i]=wordFreq;
					_docFreq[termIndex] ++;

					if (wordFreq > _maxTermFreq[i]) _maxTermFreq[i]=wordFreq;					
				}
			}
		}
		

		private void GenerateTermWeight()
		{			
			for(int i=0; i < _numTerms   ; i++)
			{
				for(int j=0; j < _numDocs ; j++)				
					_termWeight[i][j]=ComputeTermWeight (i, j);				
			}
		}

		private float GetTermFrequency(int term, int doc)
		{			
			int freq=_termFreq [term][doc];
			int maxfreq=_maxTermFreq[doc];			
			
			return ( (float) freq/(float)maxfreq );
		}

		private float GetInverseDocumentFrequency(int term)
		{
			int df=_docFreq[term];
			return Log((float) (_numDocs) / (float) df );
		}

		private float ComputeTermWeight(int term, int doc)
		{
			float tf=GetTermFrequency (term, doc);
			float idf=GetInverseDocumentFrequency(term);
			return tf * idf;
		}
		
		private  float[] GetTermVector(int doc)
		{
			float[] w=new float[_numTerms] ;
			for (int i=0; i < _numTerms; i++)											
				w[i]=_termWeight[i][doc];
			
				
			return w;
		}

		public float GetSimilarity(int doc_i, int doc_j)
		{
			float[] vector1=GetTermVector (doc_i);
			float[] vector2=GetTermVector (doc_j);

			return TermVector.ComputeCosineSimilarity(vector1, vector2) ;

		}
		
		private IDictionary GetWordFrequency(string input)
		{
			//string convertedInput=input.ToLower() ;
					
			Tokeniser tokenizer=new Tokeniser() ;
			String[] words=tokenizer.Partition(input);			
			Array.Sort(words);
			
			String[] distinctWords=GetDistinctWords(words);
						
			IDictionary result=new Hashtable();
			for (int i=0; i < distinctWords.Length; i++)
			{
				object tmp;
				tmp=CountWords(distinctWords[i], words);
				result[distinctWords[i]]=tmp;
				
			}
			
			return result;
		}				
				
		private string[] GetDistinctWords(String[] input)
		{				
			if (input == null)			
				return new string[0];			
			else
			{
				ArrayList list=new ArrayList() ;
				
				for (int i=0; i < input.Length; i++)
					if (!list.Contains(input[i])) // N-GRAM SIMILARITY?				
						list.Add(input[i]);
				
				return Tokeniser.ArrayListToArray(list) ;
			}
		}
		

		
		private int CountWords(string word, string[] words)
		{
			int itemIdx=Array.BinarySearch(words, word);
			
			if (itemIdx > 0)			
				while (itemIdx > 0 && words[itemIdx].Equals(word))				
					itemIdx--;				
						
			int count=0;
			while (itemIdx < words.Length && itemIdx >= 0)
			{
				if (words[itemIdx].Equals(word)) count++;				
				
				itemIdx++;
				if (itemIdx < words.Length)				
					if (!words[itemIdx].Equals(word)) break;					
				
			}
			
			return count;
		}				
	}
    }

Tokeniser.cs:

C#

using System;
using System.Collections.Generic;
using System.Linq;
using System.Collections;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;

namespace WindowsFormsApplication1.appcode
{
    class Tokeniser
    {
        public static string[] ArrayListToArray(ArrayList arraylist)
        {
            string[] array = new string[arraylist.Count];
            for (int i = 0; i < arraylist.Count; i++) array[i] = (string)arraylist[i];
            return array;
        }

        public string[] Partition(string input)
        {
            Regex r = new Regex("([ \\t{}():;. \n])");
            //input = input.ToLower();

            String[] tokens = r.Split(input);

            ArrayList filter = new ArrayList();

            for (int i = 0; i < tokens.Length; i++)
            {
                MatchCollection mc = r.Matches(tokens[i]);
                if (mc.Count <= 0 && tokens[i].Trim().Length > 0
                    && !StopWordsHandler.IsStopword(tokens[i]))
                    filter.Add(tokens[i]);


            }

            return ArrayListToArray(filter);
        }


        public Tokeniser()
        {
        }
    }
}

button3 is compare functionality.in this scope i have to write similarity logic in terms of percentage.

oncle please check the code for similarty b/w the two pdf files.if any probelm please intimate.please help me.

thank u.

Posted 3-Apr-15 23:45pm

Krishna Veni

Updated 4-Apr-15 0:04am

v2

Add a Solution

Comments

Mehdi Gholam 4-Apr-15 6:01am

"Not working properly" is not helpful information.

Krishna Veni 4-Apr-15 6:10am

please check the what is the probelm

Krishna Veni 4-Apr-15 7:17am

not work properly means i got 0% b/w any near Duplicate files but i want correct percentage my point of view logic is correct but i don't know how to 0% is returned in button 3.please help me.Finally i want similarity b/w two pdf files interms of percentage.please help me.thank u.

Add your solution here

Treat my content as plain text, not as HTML

Preview 0

…

Existing Members

Sign in to your account

...or Join us

Download, Vote, Comment, Publish.

Your Email
Password
Forgot your password?

Your Email
This email is in use. Do you need your password?
Optional Password

I have read and agree to the Terms of Service and Privacy Policy
Please subscribe me to the CodeProject newsletters

When answering a question please:

Read the question carefully.
Understand that English isn't everyone's first language so be lenient of bad spelling and grammar.
If a question is poorly phrased then either ask for clarification, ignore it, or edit the question and fix the problem. Insults are not welcome.
Don't tell someone to read the manual. Chances are they have and don't get it. Provide an answer or move on to the next question.

Let's work to help developers, not make them feel stupid.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)