Click here to Skip to main content
15,895,011 members
Please Sign up or sign in to vote.
1.00/5 (1 vote)
See more:
I have Develop application for near duplicate detection in c#.This application work for strings only but it's not working for pdf files.may be i think GetSimilarity method can not work properly but error not be raised.
my application code like as:

C#
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using iTextSharp.text;
using System.Threading;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.Text.RegularExpressions;
using WindowsFormsApplication1.appcode;

namespace WindowsFormsApplication1
{
    public partial class Form1 : Form
    {
        string filename;
        FileInfo[] data1;
        FileInfo[] data2;
        string path;
        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            OpenFileDialog openFileDialog = new OpenFileDialog();
            openFileDialog.CheckFileExists = true;
            openFileDialog.AddExtension = true;
            openFileDialog.Filter = "PDF files (*.pdf)|*.pdf";
            DialogResult result = openFileDialog.ShowDialog();
            if (result == DialogResult.OK)
            {
                filename = Path.GetFileName(openFileDialog.FileName);
                path = Path.GetDirectoryName(openFileDialog.FileName);
                textBox1.Text = path + "\\" + filename;
               
            }
        }

        private void button2_Click(object sender, EventArgs e)
        {

            OpenFileDialog openFileDialog = new OpenFileDialog();
            openFileDialog.CheckFileExists = true;
            openFileDialog.AddExtension = true;
            openFileDialog.Filter = "PDF files (*.pdf)|*.pdf";
            DialogResult result = openFileDialog.ShowDialog();
            if (result == DialogResult.OK)
            {
                filename = Path.GetFileName(openFileDialog.FileName);
                path = Path.GetDirectoryName(openFileDialog.FileName);
                textBox2.Text = path + "\\" + filename;

            }
        }

        public static string ExtractTextFromPdf(string filename)
        {
            using (PdfReader r = new PdfReader(filename))
            {
                StringBuilder text = new StringBuilder();

                for (int i = 1; i <= r.NumberOfPages; i++)
                {
                    text.Append(PdfTextExtractor.GetTextFromPage(r, i));
                }

                string first = text.ToString();
                return first;
            }
        }
        public static string Extract(string filename)
        {
            using (PdfReader r = new PdfReader(filename))
            {
                StringBuilder text = new StringBuilder();

                for (int i = 1; i <= r.NumberOfPages; i++)
                {
                    text.Append(PdfTextExtractor.GetTextFromPage(r, i));
                }

                string second = text.ToString();
                return second;
            }
        }

       private void button3_Click(object sender, EventArgs e)
        {
            StopWordsHandler stopword = new StopWordsHandler();
            string s = ExtractTextFromPdf(textBox1.Text);
            string s1 = Extract(textBox2.Text);
            string[] doc = new string[2]{s,s1 };
            TFIDF tfidf = new TFIDF(doc);
            float fl = tfidf.GetSimilarity(0,1);
            var sformatted = string.Format("Value: {0:P2}.", fl);
           
        }


StopWordsHandler.cs:
C#
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace WindowsFormsApplication1.appcode
{
    class StopWordsHandler
    {
        public static string[] stopWordsList=new string[] {

															  "a", 																													  
															  "about", 
															  "above", 
															  "across", 
															  "afore", 
															  "aforesaid",
                                                              "after", 
                                                              "again", 
                                                              "against", 
                                                              "agin", 
                                                              "ago", 
                                                              "aint", 
                                                              "albeit", 
                                                              "all", 
                                                              "almost", 
                                                              "alone", 
                                                              "along", 
                                                              "alongside", 
                                                              "already", 
                                                              "also", 
                                                              "although", 
                                                              "always", 
                                                              "am", 
                                                              "american", 
                                                              "amid", 
                                                              "amidst", 
                                                              "among", 
                                                              "amongst", 
                                                              "an", 
                                                              "and", 
                                                              "anent", 
                                                              "another", 
                                                              "any", 
                                                              "anybody", 
                                                              "anyone", 
                                                              "anything", 
                                                              "are", 
                                                              "aren't", 
                                                              "around", 
                                                              "as", 
                                                              "aslant", 
                                                              "astride", 
                                                              "at", 
                                                              "athwart", 
                                                              "away", 
                                                              "b", 
                                                              "back", 
                                                              "bar", 
                                                              "barring", 
                                                              "be", 
                                                              "because", 
                                                              "been", 
                                                              "before", 
                                                              "behind", 
                                                              "being", 
                                                              "below", 
                                                              "beneath", 
                                                              "beside", 
                                                              "besides", 
                                                              "best", 
                                                              "better", 
                                                              "between", 
                                                              "betwixt", 
                                                              "beyond", 
                                                              "both", 
                                                              "but", 
                                                              "by", 
                                                              "c", 
                                                              "can", 
                                                              "cannot", 
                                                              "can't", 
                                                              "certain", 
                                                              "circa", 
                                                              "close", 
                                                              "concerning", 
                                                              "considering", 
                                                              "cos", 
                                                              "could", 
                                                              "couldn't", 
                                                              "couldst", 
                                                              "d", 
                                                              "dare", 
                                                              "dared", 
                                                              "daren't", 
                                                              "dares", 
                                                              "daring", 
                                                              "despite", 
                                                              "did", 
                                                              "didn't", 
                                                              "different", 
                                                              "directly", 
                                                              "do", 
                                                              "does", 
                                                              "doesn't", 
                                                              "doing", 
                                                              "done", 
                                                              "don't", 
                                                              "dost", 
                                                              "doth", 
                                                              "down", 
                                                              "during", 
                                                              "durst", 
                                                              "e", 
                                                              "each", 
                                                              "early", 
                                                              "either", 
                                                              "em", 
                                                              "english", 
                                                              "enough", 
                                                              "ere", 
                                                              "even", 
                                                              "ever", 
                                                              "every", 
                                                              "everybody", 
                                                              "everyone", 
                                                              "everything", 
                                                              "except", 
                                                              "excepting", 
                                                              "f", 
                                                              "failing", 
                                                              "far", 
                                                              "few", 
                                                              "first", 
                                                              "five", 
                                                              "following", 
                                                              "for", 
                                                              "four", 
                                                              "from", 
                                                              "g", 
                                                              "gonna", 
                                                              "gotta", 
                                                              "h", 
                                                              "had", 
                                                              "hadn't", 
                                                              "hard", 
                                                              "has", 
                                                              "hasn't", 
                                                              "hast", 
                                                              "hath", 
                                                              "have", 
                                                              "haven't", 
                                                              "having", 
                                                              "he", 
                                                              "he'd", 
                                                              "he'll", 
                                                              "her", 
                                                              "here", 
                                                              "here's", 
                                                              "hers", 
                                                              "herself", 
                                                              "he's", 
                                                              "high", 
                                                              "him", 
                                                              "himself", 
                                                              "his", 
                                                              "home", 
                                                              "how", 
                                                              "howbeit", 
                                                              "however", 
                                                              "how's", 
                                                              "i", 
                                                              "id", 
                                                              "if", 
                                                              "ill", 
                                                              "i'm", 
                                                              "immediately", 
                                                              "important", 
                                                              "in", 
                                                              "inside", 
                                                              "instantly", 
                                                              "into", 
                                                              "is", 
                                                              "isn't", 
                                                              "it", 
                                                              "it'll", 
                                                              "it's", 
                                                              "its", 
                                                              "itself", 
                                                              "i've", 
                                                              "j", 
                                                              "just", 
                                                              "k", 
                                                              "l", 
                                                              "large", 
                                                              "last", 
                                                              "later", 
                                                              "least", 
                                                              "left", 
                                                              "less", 
                                                              "lest", 
                                                              "let's", 
                                                              "like", 
                                                              "likewise", 
                                                              "little", 
                                                              "living", 
                                                              "long", 
                                                              "m", 
                                                              "many", 
                                                              "may", 
                                                              "mayn't", 
                                                              "me", 
                                                              "mid", 
                                                              "midst", 
                                                              "might", 
                                                              "mightn't", 
                                                              "mine", 
                                                              "minus", 
                                                              "more", 
                                                              "most", 
                                                              "much", 
                                                              "must", 
                                                              "mustn't", 
                                                              "my", 
                                                              "myself", 
                                                              "n", 
                                                              "near", 
                                                              "'neath", 
                                                              "need", 
                                                              "needed", 
                                                              "needing", 
                                                              "needn't", 
                                                              "needs", 
                                                              "neither", 
                                                              "never", 
                                                              "nevertheless", 
                                                              "new", 
                                                              "next", 
                                                              "nigh", 
                                                              "nigher", 
                                                              "nighest", 
                                                              "nisi", 
                                                              "no", 
                                                              "no-one", 
                                                              "nobody", 
                                                              "none", 
                                                              "nor", 
                                                              "not", 
                                                              "nothing", 
                                                              "notwithstanding", 
                                                              "now", 
                                                              "o", 
                                                              "o'er", 
                                                              "of", 
                                                              "off", 
                                                              "often", 
                                                              "on", 
                                                              "once", 
                                                              "one", 
                                                              "oneself", 
                                                              "only", 
                                                              "onto", 
                                                              "open", 
                                                              "or", 
                                                              "other", 
                                                              "otherwise", 
                                                              "ought", 
                                                              "oughtn't", 
                                                              "our", 
                                                              "ours", 
                                                              "ourselves", 
                                                              "out", 
                                                              "outside", 
                                                              "over", 
                                                              "own", 
                                                              "p", 
                                                              "past", 
                                                              "pending", 
                                                              "per", 
                                                              "perhaps", 
                                                              "plus", 
                                                              "possible", 
                                                              "present", 
                                                              "probably", 
                                                              "provided", 
                                                              "providing", 
                                                              "public", 
                                                              "q", 
                                                              "qua", 
                                                              "quite", 
                                                              "r", 
                                                              "rather", 
                                                              "re", 
                                                              "real", 
                                                              "really", 
                                                              "respecting", 
                                                              "right", 
                                                              "round", 
                                                              "s", 
                                                              "same", 
                                                              "sans", 
                                                              "save", 
                                                              "saving", 
                                                              "second", 
                                                              "several", 
                                                              "shall", 
                                                              "shalt", 
                                                              "shan't", 
                                                              "she", 
                                                              "shed", 
                                                              "shell", 
                                                              "she's", 
                                                              "short", 
                                                              "should", 
                                                              "shouldn't", 
                                                              "since", 
                                                              "six", 
                                                              "small", 
                                                              "so", 
                                                              "some", 
                                                              "somebody", 
                                                              "someone", 
                                                              "something", 
                                                              "sometimes", 
                                                              "soon", 
                                                              "special", 
                                                              "still", 
                                                              "such", 
                                                              "summat", 
                                                              "supposing", 
                                                              "sure", 
                                                              "t", 
                                                              "than", 
                                                              "that", 
                                                              "that'd", 
                                                              "that'll", 
                                                              "that's", 
                                                              "the", 
                                                              "thee", 
                                                              "their", 
                                                              "theirs", 
                                                              "their's", 
                                                              "them", 
                                                              "themselves", 
                                                              "then", 
                                                              "there", 
                                                              "there's", 
                                                              "these", 
                                                              "they", 
                                                              "they'd", 
                                                              "they'll", 
                                                              "they're", 
                                                              "they've", 
                                                              "thine", 
                                                              "this", 
                                                              "tho", 
                                                              "those", 
                                                              "thou", 
                                                              "though", 
                                                              "three", 
                                                              "thro'", 
                                                              "through", 
                                                              "throughout", 
                                                              "thru", 
                                                              "thyself", 
                                                              "till", 
                                                              "to", 
                                                              "today", 
                                                              "together", 
                                                              "too", 
                                                              "touching", 
                                                              "toward", 
                                                              "towards", 
                                                              "true", 
                                                              "'twas", 
                                                              "'tween", 
                                                              "'twere", 
                                                              "'twill", 
                                                              "'twixt", 
                                                              "two", 
                                                              "'twould", 
                                                              "u", 
                                                              "under", 
                                                              "underneath", 
                                                              "unless", 
                                                              "unlike", 
                                                              "until", 
                                                              "unto", 
                                                              "up", 
                                                              "upon", 
                                                              "us", 
                                                              "used", 
                                                              "usually", 
                                                              "v", 
                                                              "versus", 
                                                              "very", 
                                                              "via", 
                                                              "vice", 
                                                              "vis-a-vis", 
                                                              "w", 
                                                              "wanna", 
                                                              "wanting", 
                                                              "was", 
                                                              "wasn't", 
                                                              "way", 
                                                              "we", 
                                                              "we'd", 
                                                              "well", 
                                                              "were", 
                                                              "weren't", 
                                                              "wert", 
                                                              "we've", 
                                                              "what", 
                                                              "whatever", 
                                                              "what'll", 
                                                              "what's", 
                                                              "when", 
                                                              "whencesoever", 
                                                              "whenever", 
                                                              "when's", 
                                                              "whereas", 
                                                              "where's", 
                                                              "whether", 
                                                              "which", 
                                                              "whichever", 
                                                              "whichsoever", 
                                                              "while", 
                                                              "whilst", 
                                                              "who", 
                                                              "who'd", 
                                                              "whoever", 
                                                              "whole", 
                                                              "who'll", 
                                                              "whom", 
                                                              "whore", 
                                                              "who's", 
                                                              "whose", 
                                                              "whoso", 
                                                              "whosoever", 
                                                              "will", 
                                                              "with", 
                                                              "within", 
                                                              "without", 
                                                              "wont", 
                                                              "would", 
                                                              "wouldn't", 
                                                              "wouldst", 
                                                              "x", 
                                                              "y", 
                                                              "ye", 
                                                              "yet", 
                                                              "you", 
                                                              "you'd", 
                                                              "you'll", 
                                                              "your", 
                                                              "you're", 
                                                              "yours", 
                                                              "yourself", 
                                                              "yourselves", 
                                                              "you've", 
                                                              "z", 
        } ;

		private static Hashtable _stopwords=null;

		public static object AddElement(IDictionary collection,Object key, object newValue)
		{
			object element = collection[key];
			collection[key] = newValue;
			return element;
		}

        public static bool IsStopword(string str)
        {

            //int index=Array.BinarySearch(stopWordsList, str)
            return _stopwords.ContainsKey(str);
        }
	

		public StopWordsHandler()
		{
			if (_stopwords == null)
			{
				_stopwords = new Hashtable();
				double dummy = 0;
				foreach (string word in stopWordsList)
				{
					AddElement(_stopwords, word, dummy);
				}
			}
		}
	}
    }

TFIDF.cs:

C#
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace WindowsFormsApplication1.appcode
{
    class TFIDF
    {
        private string[] _docs;
		private string[][] _ngramDoc;
		private int _numDocs=0;
		private int _numTerms=0;
		private ArrayList _terms;
		private int[][] _termFreq;
		private float[][] _termWeight;
		private int[] _maxTermFreq;
		private int[] _docFreq;


		public class TermVector
		{		
			public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
			{
				if (vector1.Length != vector2.Length)				
					throw new Exception("DIFER LENGTH");
				

				float denom=(VectorLength(vector1) * VectorLength(vector2));
				if (denom == 0F)				
					return 0F;				
				else				
					return (InnerProduct(vector1, vector2) / denom);
				
			}

			public static float InnerProduct(float[] vector1, float[] vector2)
			{
			
				if (vector1.Length != vector2.Length)
					throw new Exception("DIFFER LENGTH ARE NOT ALLOWED");
				
			
				float result=0F;
				for (int i=0; i < vector1.Length; i++)				
					result += vector1[i] * vector2[i];
				
				return result;
			}
		
			public static float VectorLength(float[] vector)
			{			
				float sum=0.0F;
				for (int i=0; i < vector.Length; i++)				
					sum=sum + (vector[i] * vector[i]);
						
				return (float)Math.Sqrt(sum);
			}

		}

		private IDictionary _wordsIndex=new Hashtable() ;

		public TFIDF(string[] documents)
		{
			_docs=documents;
			_numDocs=documents.Length ;
			MyInit();
		}

		private void GeneratNgramText()
		{
			
		}

		private ArrayList GenerateTerms(string[] docs)
		{
			ArrayList uniques=new ArrayList() ;
			_ngramDoc=new string[_numDocs][] ;
			for (int i=0; i < docs.Length ; i++)
			{
				
                Tokeniser tokenizer=new Tokeniser() ;
				string[] words=tokenizer.Partition(docs[i]);			

				for (int j=0; j < words.Length ; j++)
					if (!uniques.Contains(words[j]) )				
						uniques.Add(words[j]) ;
								
			}
			return uniques;
		}
		


		private static object AddElement(IDictionary collection, object key, object newValue)
		{
			object element=collection[key];
			collection[key]=newValue;
			return element;
		}

		private int GetTermIndex(string term)
		{
			object index=_wordsIndex[term];
			if (index == null) return -1;
			return (int) index;
		}

		private void MyInit()
		{
			_terms=GenerateTerms (_docs );
			_numTerms=_terms.Count ;

			_maxTermFreq=new int[_numDocs] ;
			_docFreq=new int[_numTerms] ;
			_termFreq =new int[_numTerms][] ;
			_termWeight=new float[_numTerms][] ;

			for(int i=0; i < _terms.Count ; i++)			
			{
				_termWeight[i]=new float[_numDocs] ;
				_termFreq[i]=new int[_numDocs] ;

				AddElement(_wordsIndex, _terms[i], i);			
			}
			
			GenerateTermFrequency ();
			GenerateTermWeight();			
				
		}
		
		private float Log(float num)
		{
			return (float) Math.Log(num) ;//log2
		}

		private void GenerateTermFrequency()
		{
			for(int i=0; i < _numDocs  ; i++)
			{								
				string curDoc=_docs[i];
				IDictionary freq=GetWordFrequency(curDoc);
				IDictionaryEnumerator enums=freq.GetEnumerator() ;
				_maxTermFreq[i]=int.MinValue ;
				while (enums.MoveNext())
				{
					string word=(string)enums.Key;
					int wordFreq=(int)enums.Value ;
					int termIndex=GetTermIndex(word);

					_termFreq [termIndex][i]=wordFreq;
					_docFreq[termIndex] ++;

					if (wordFreq > _maxTermFreq[i]) _maxTermFreq[i]=wordFreq;					
				}
			}
		}
		

		private void GenerateTermWeight()
		{			
			for(int i=0; i < _numTerms   ; i++)
			{
				for(int j=0; j < _numDocs ; j++)				
					_termWeight[i][j]=ComputeTermWeight (i, j);				
			}
		}

		private float GetTermFrequency(int term, int doc)
		{			
			int freq=_termFreq [term][doc];
			int maxfreq=_maxTermFreq[doc];			
			
			return ( (float) freq/(float)maxfreq );
		}

		private float GetInverseDocumentFrequency(int term)
		{
			int df=_docFreq[term];
			return Log((float) (_numDocs) / (float) df );
		}

		private float ComputeTermWeight(int term, int doc)
		{
			float tf=GetTermFrequency (term, doc);
			float idf=GetInverseDocumentFrequency(term);
			return tf * idf;
		}
		
		private  float[] GetTermVector(int doc)
		{
			float[] w=new float[_numTerms] ;
			for (int i=0; i < _numTerms; i++)											
				w[i]=_termWeight[i][doc];
			
				
			return w;
		}

		public float GetSimilarity(int doc_i, int doc_j)
		{
			float[] vector1=GetTermVector (doc_i);
			float[] vector2=GetTermVector (doc_j);

			return TermVector.ComputeCosineSimilarity(vector1, vector2) ;

		}
		
		private IDictionary GetWordFrequency(string input)
		{
			//string convertedInput=input.ToLower() ;
					
			Tokeniser tokenizer=new Tokeniser() ;
			String[] words=tokenizer.Partition(input);			
			Array.Sort(words);
			
			String[] distinctWords=GetDistinctWords(words);
						
			IDictionary result=new Hashtable();
			for (int i=0; i < distinctWords.Length; i++)
			{
				object tmp;
				tmp=CountWords(distinctWords[i], words);
				result[distinctWords[i]]=tmp;
				
			}
			
			return result;
		}				
				
		private string[] GetDistinctWords(String[] input)
		{				
			if (input == null)			
				return new string[0];			
			else
			{
				ArrayList list=new ArrayList() ;
				
				for (int i=0; i < input.Length; i++)
					if (!list.Contains(input[i])) // N-GRAM SIMILARITY?				
						list.Add(input[i]);
				
				return Tokeniser.ArrayListToArray(list) ;
			}
		}
		

		
		private int CountWords(string word, string[] words)
		{
			int itemIdx=Array.BinarySearch(words, word);
			
			if (itemIdx > 0)			
				while (itemIdx > 0 && words[itemIdx].Equals(word))				
					itemIdx--;				
						
			int count=0;
			while (itemIdx < words.Length && itemIdx >= 0)
			{
				if (words[itemIdx].Equals(word)) count++;				
				
				itemIdx++;
				if (itemIdx < words.Length)				
					if (!words[itemIdx].Equals(word)) break;					
				
			}
			
			return count;
		}				
	}
    }


Tokeniser.cs:

C#
using System;
using System.Collections.Generic;
using System.Linq;
using System.Collections;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;

namespace WindowsFormsApplication1.appcode
{
    class Tokeniser
    {
        public static string[] ArrayListToArray(ArrayList arraylist)
        {
            string[] array = new string[arraylist.Count];
            for (int i = 0; i < arraylist.Count; i++) array[i] = (string)arraylist[i];
            return array;
        }

        public string[] Partition(string input)
        {
            Regex r = new Regex("([ \\t{}():;. \n])");
            //input = input.ToLower();

            String[] tokens = r.Split(input);

            ArrayList filter = new ArrayList();

            for (int i = 0; i < tokens.Length; i++)
            {
                MatchCollection mc = r.Matches(tokens[i]);
                if (mc.Count <= 0 && tokens[i].Trim().Length > 0
                    && !StopWordsHandler.IsStopword(tokens[i]))
                    filter.Add(tokens[i]);


            }

            return ArrayListToArray(filter);
        }


        public Tokeniser()
        {
        }
    }
}



button3 is compare functionality.in this scope i have to write similarity logic in terms of percentage.

oncle please check the code for similarty b/w the two pdf files.if any probelm please intimate.please help me.


thank u.
Posted
Updated 4-Apr-15 0:04am
v2
Comments
Mehdi Gholam 4-Apr-15 6:01am    
"Not working properly" is not helpful information.
Krishna Veni 4-Apr-15 6:10am    
please check the what is the probelm
Krishna Veni 4-Apr-15 7:17am    
not work properly means i got 0% b/w any near Duplicate files but i want correct percentage my point of view logic is correct but i don't know how to 0% is returned in button 3.please help me.Finally i want similarity b/w two pdf files interms of percentage.please help me.thank u.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900