Click here to Skip to main content
15,887,822 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
The recource using is almost good(10-20% CPU, RAM starts from 8Mb and grows), file comparer is normal, but program is slow on 7000+ files. About 20-30 minutes. How optimize? Maybe i shouldn't use recursive method in functions? Maybe the structure is not optimized? Please, help.
C#
using System;
using System.Collections.Generic;
using System.IO;
using System.Diagnostics;

namespace ConsoleApplication3
{
    class Comparer
    {
        static void Main(string[] args)
        {
            Stopwatch s1 = new Stopwatch();
            s1.Start();
            switch (args.Length)
            {
                case 1:
                    {
                        CompareFilesRec(LookIn(args[0]));
                        s1.Stop();
                        break;
                    }
                default:
                    {
                        Console.WriteLine("Type only one argument.");
                        break;
                    }

            }
            
            Console.WriteLine("{0} ms", s1.ElapsedMilliseconds);
        }

        static bool CheckFile(string file)
        {
            FileInfo someFileInfo = new FileInfo(file);//берем інфу про файл в змінну someFileInfo
            if (someFileInfo.Length >= 2147483648 || someFileInfo.Length < 1)//перевірка
                return false;
            else return true;
        }
        static bool CheckDirEmpty(string dir)
        {
            DirectoryInfo someDir = new DirectoryInfo(dir);
            if (someDir.GetFiles().Length > 0)
                return false;
            else return true;
        }

        static List<string> LookIn(string path)
        {
            Stopwatch s1 = new Stopwatch();
            s1.Start();
            /*Ініціалізую і об*являю два списки:для файлів і папок*/
            List<string> files = new List<string>();
            List<string> dirs = new List<string>();

            /*Шукаєм всі доступні файли*/
            try
            {
                /*Добавляю знайдені папки і файли в список*/
                files.AddRange(Directory.GetFiles(path));
                dirs.AddRange(Directory.GetDirectories(path));    
            }
            catch (UnauthorizedAccessException) { }
            catch (DirectoryNotFoundException) { }
            catch (ArgumentOutOfRangeException) { }
            catch (IOException) { }

            for (int i = files.Count - 1; i >= 0; i--)
            {
                if (!CheckFile(files[i]))
                    files.RemoveAt(i);
            }
            for (int k = dirs.Count - 1; k >= 0; k--)
            {
                if(CheckDirEmpty(dirs[k]))
                    dirs.RemoveAt(k);
            }
            
            /*"Заглядаєм" за файлами в кожну директорію...*/
            foreach (string dir in dirs)
            {
                files.AddRange(LookIn(dir));//...і додаєм до списку
            }
            s1.Stop();
            Console.WriteLine("LookIN  = {0} ms", s1.ElapsedMilliseconds);
            return files;//повертаєм повний список знайдених файлів
        }

        static void CompareFilesRec(List<string> array)
        {
            Stopwatch s1 = new Stopwatch();
            s1.Start();
            List<KeyValuePair<long, string>> yeah = new List<KeyValuePair<long, string>>();
            List<string> Trash = new List<string>();
            List<string> outp = new List<string>();

            for (int j = array.Count-1; j >= 0; j--)
            {
                FileInfo fii = new FileInfo(array[j]);
                yeah.Add(new KeyValuePair<long, string>(fii.Length, array[j]));
            }
            array.Clear();

            foreach (var el in yeah)
            {
                if (!array.Contains(el.Value) && !Trash.Contains(el.Value))
                {
                    foreach (var ele in yeah)
                    {
                        if (el.Key == ele.Key && !array.Contains(ele.Value))
                        {
                            array.Add(ele.Value);
                        }
                        else
                        {
                            if (!Trash.Contains(ele.Value))
                            {
                                Trash.Add(ele.Value);
                            }
                        }
                    }
                }
            }
            yeah.Clear();
            /*TODO*/
            //byte crc = Crc8.ComputeChecksum(1, 2);
            int Arr = ComputeByteChecksum(array[0]);
            foreach (string f in array)
            {
                int File = ComputeByteChecksum(f);
                if (f != array[0] && Arr == File)
                    outp.Add(f);
                if (Arr != File)
                    if (!Trash.Contains(f))
                         Trash.Add(f);
            }
            outp.Add(array[0]);
            array.Clear();
            /*TODO end*/
            if (outp.Count > 1)
            {
                foreach (string fi in outp)
                {
                    Console.WriteLine(fi);
                }
                outp.Clear();
                Console.WriteLine();
            }
            
            if (Trash.Count > 1)
            {
                CompareFilesRec(Trash);
            }
            Trash.Clear();
            s1.Stop();
            Console.WriteLine("Comparing = {0} ms", s1.ElapsedMilliseconds);
        }
        static int ComputeByteChecksum(string path)
        {
            using (var reader = new BinaryReader(File.OpenRead(path)))
            {
                byte b1 = reader.ReadByte();
                reader.BaseStream.Position = reader.BaseStream.Position = reader.BaseStream.Length >> 1;
                byte b2 = reader.ReadByte();
                reader.BaseStream.Position = reader.BaseStream.Length - 1;
                byte b3 = reader.ReadByte();
                reader.Close();
                byte crc = Crc8.ComputeChecksum(1, 2);
                return Crc8.ComputeChecksum(b1, crc) + Crc8.ComputeChecksum(b2, crc) + Crc8.ComputeChecksum(b3, crc); 
            }
        }
    }
    public static class Crc8
    {
        static byte[] table = new byte[256];

        const byte poly = 0xd5;

        public static byte ComputeChecksum(params byte[] bytes)
        {
            byte crc = 0;
            if (bytes != null && bytes.Length > 0)
            {
                foreach (byte b in bytes)
                {
                    crc = table[crc ^ b];
                }
            }
            return crc;
        }

        static Crc8()
        {
            for (int i = 0; i < 256; ++i)
            {
                int temp = i;
                for (int j = 0; j < 8; ++j)
                {
                    if ((temp & 0x80) != 0)
                    {
                        temp = (temp << 1) ^ poly;
                    }
                    else
                    {
                        temp <<= 1;
                    }
                }
                table[i] = (byte)temp;
            }
        }
    }
}
Posted
Updated 24-Nov-12 23:32pm
v2

1 solution

You could look at a couple of approaches;

But first, what are you comparing? You are rolling your own CRCs, it maybe better to used some of the optimised .Net methods for building the CRCs or use larger hashes to reduce collission risk.

You could use a multithreaded approach, first build and index of files you are comparing, then get worker threads to then parrallel hash the files etc. the limiting factor is likely to be disk throughput, so you could keep increasing the thread count until just before you bottlekneck on the HDD subsystem.

Multithreading doesn't always help, so it would be beneficial to benchmark different approachs;
e.g. 1 thread - 1 file at a time
e.g. multiple threads - multiple files in parrallel
e.g. your own crc vs .Net classes (probably more optimised)
 
Share this answer
 
Comments
Je7 25-Nov-12 6:24am    
I just need to compare files(their content) in all directories. In my solution i take onle 3 bytes from file. In the beginning, in the half of file and in the end. So, how i understand you: create some hash array from that files(using multithreading) and than compare hashes?
DaveAuld 25-Nov-12 6:33am    
Have a look at this: http://www.codeproject.com/Articles/28512/Duplicate-Files-Finder it is doing a directory trawl and file comparison using MD5 hash. source is available.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900