no duplicates - Page 17

Suggestions for duplicate file finder algorithm (using C)

- by Andrei Ciobanu

Hello, I wanted to write a program that test if two files are duplicates (have exactly the same content). First I test if the files have the same sizes, and if they have i start to compare their contents. My first idea, was to "split" the files into fixed size blocks, then start a thread for every block, fseek to startup character of every block and continue the comparisons in parallel. When a comparison from a thread fails, the other working threads are canceled, and the program exits out of the thread spawning loop. The code looks like this: dupf.h #ifndef __NM__DUPF__H__ #define __NM__DUPF__H__ #define NUM_THREADS 15 #define BLOCK_SIZE 8192 /* Thread argument structure */ struct thread_arg_s { const char *name_f1; /* First file name */ const char *name_f2; /* Second file name */ int cursor; /* Where to seek in the file */ }; typedef struct thread_arg_s thread_arg; /** * 'arg' is of type thread_arg. * Checks if the specified file blocks are * duplicates. */ void *check_block_dup(void *arg); /** * Checks if two files are duplicates */ int check_dup(const char *name_f1, const char *name_f2); /** * Returns a valid pointer to a file. * If the file (given by the path/name 'fname') cannot be opened * in 'mode', the program is interrupted an error message is shown. **/ FILE *safe_fopen(const char *name, const char *mode); #endif dupf.c #include <errno.h> #include <pthread.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> #include "dupf.h" FILE *safe_fopen(const char *fname, const char *mode) { FILE *f = NULL; f = fopen(fname, mode); if (f == NULL) { char emsg[255]; sprintf(emsg, "FOPEN() %s\t", fname); perror(emsg); exit(-1); } return (f); } void *check_block_dup(void *arg) { const char *name_f1 = NULL, *name_f2 = NULL; /* File names */ FILE *f1 = NULL, *f2 = NULL; /* Streams */ int cursor = 0; /* Reading cursor */ char buff_f1[BLOCK_SIZE], buff_f2[BLOCK_SIZE]; /* Character buffers */ int rchars_1, rchars_2; /* Readed characters */ /* Initializing variables from 'arg' */ name_f1 = ((thread_arg*)arg)->name_f1; name_f2 = ((thread_arg*)arg)->name_f2; cursor = ((thread_arg*)arg)->cursor; /* Opening files */ f1 = safe_fopen(name_f1, "r"); f2 = safe_fopen(name_f2, "r"); /* Setup cursor in files */ fseek(f1, cursor, SEEK_SET); fseek(f2, cursor, SEEK_SET); /* Initialize buffers */ rchars_1 = fread(buff_f1, 1, BLOCK_SIZE, f1); rchars_2 = fread(buff_f2, 1, BLOCK_SIZE, f2); if (rchars_1 != rchars_2) { /* fread failed to read the same portion. * program cannot continue */ perror("ERROR WHEN READING BLOCK"); exit(-1); } while (rchars_1-->0) { if (buff_f1[rchars_1] != buff_f2[rchars_1]) { /* Different characters */ fclose(f1); fclose(f2); pthread_exit("notdup"); } } /* Close streams */ fclose(f1); fclose(f2); pthread_exit("dup"); } int check_dup(const char *name_f1, const char *name_f2) { int num_blocks = 0; /* Number of 'blocks' to check */ int num_tsp = 0; /* Number of threads spawns */ int tsp_iter = 0; /* Iterator for threads spawns */ pthread_t *tsp_threads = NULL; thread_arg *tsp_threads_args = NULL; int tsp_threads_iter = 0; int thread_c_res = 0; /* Thread creation result */ int thread_j_res = 0; /* Thread join res */ int loop_res = 0; /* Function result */ int cursor; struct stat buf_f1; struct stat buf_f2; if (name_f1 == NULL || name_f2 == NULL) { /* Invalid input parameters */ perror("INVALID FNAMES\t"); return (-1); } if (stat(name_f1, &buf_f1) != 0 || stat(name_f2, &buf_f2) != 0) { /* Stat fails */ char emsg[255]; sprintf(emsg, "STAT() ERROR: %s %s\t", name_f1, name_f2); perror(emsg); return (-1); } if (buf_f1.st_size != buf_f2.st_size) { /* File have different sizes */ return (1); } /* Files have the same size, function exec. is continued */ num_blocks = (buf_f1.st_size / BLOCK_SIZE) + 1; num_tsp = (num_blocks / NUM_THREADS) + 1; cursor = 0; for (tsp_iter = 0; tsp_iter < num_tsp; tsp_iter++) { loop_res = 0; /* Create threads array for this spawn */ tsp_threads = malloc(NUM_THREADS * sizeof(*tsp_threads)); if (tsp_threads == NULL) { perror("TSP_THREADS ALLOC FAILURE\t"); return (-1); } /* Create arguments for every thread in the current spawn */ tsp_threads_args = malloc(NUM_THREADS * sizeof(*tsp_threads_args)); if (tsp_threads_args == NULL) { perror("TSP THREADS ARGS ALLOCA FAILURE\t"); return (-1); } /* Initialize arguments and create threads */ for (tsp_threads_iter = 0; tsp_threads_iter < NUM_THREADS; tsp_threads_iter++) { if (cursor >= buf_f1.st_size) { break; } tsp_threads_args[tsp_threads_iter].name_f1 = name_f1; tsp_threads_args[tsp_threads_iter].name_f2 = name_f2; tsp_threads_args[tsp_threads_iter].cursor = cursor; thread_c_res = pthread_create( &tsp_threads[tsp_threads_iter], NULL, check_block_dup, (void*)&tsp_threads_args[tsp_threads_iter]); if (thread_c_res != 0) { perror("THREAD CREATION FAILURE"); return (-1); } cursor+=BLOCK_SIZE; } /* Join last threads and get their status */ while (tsp_threads_iter-->0) { void *thread_res = NULL; thread_j_res = pthread_join(tsp_threads[tsp_threads_iter], &thread_res); if (thread_j_res != 0) { perror("THREAD JOIN FAILURE"); return (-1); } if (strcmp((char*)thread_res, "notdup")==0) { loop_res++; /* Closing other threads and exiting by condition * from loop. */ while (tsp_threads_iter-->0) { pthread_cancel(tsp_threads[tsp_threads_iter]); } } } free(tsp_threads); free(tsp_threads_args); if (loop_res > 0) { break; } } return (loop_res > 0) ? 1 : 0; } The function works fine (at least for what I've tested). Still, some guys from #C (freenode) suggested that the solution is overly complicated, and it may perform poorly because of parallel reading on hddisk. What I want to know: Is the threaded approach flawed by default ? Is fseek() so slow ? Is there a way to somehow map the files to memory and then compare them ?

Search Results

Search found 886 results on 36 pages for 'no duplicates'.

Page 17/36 | < Previous Page | 13 14 15 16 17 18 19 20 21 22 23 24 | Next Page >

- by Andrei Ciobanu

- by David

- by Yogesh

- by Blaine LaFreniere

- by Pandiya Chendur

- by Matt S.

- by Donny P

- by x3ro

- by Amy

- by gurukulki

- by Wilhelmsen

- by krike

- by Joseph

- by calccrypto

- by Female Gay

- by Nathan Fellman

- by Peter Stegnar

- by Gabriel

- by wrongusername

- by gneash

- by Gazow

- by Stecy

- by bobo

- by zsharp

- by Dave

< Previous Page | 13 14 15 16 17 18 19 20 21 22 23 24 | Next Page >