diff --git a/src/kr_aligner.c b/src/kr_aligner.c index 82746d53ed0c496d9836e3f61e86ba9f2c0ee6ef..44981f265b1bbf9ca08b2ae47db4c0fc832f4bc9 100644 --- a/src/kr_aligner.c +++ b/src/kr_aligner.c @@ -15,6 +15,19 @@ typedef struct { seq_t * seq; } hashed_object_t; +/* Compare must return the distance between the object and the value. + * So 0 correspond to match + */ +int compare (const void* arg, const void* obj) { + hashed_object_t * hashed = (hashed_object_t *)obj; + uint64_t hash = *((uint64_t *)arg); + + if (hash == hashed->kmers[0].hash || hash == hashed->kmers[1].hash) + return 0; + else + return 1; +} + /* Hashtable to store all the reads and main kmers regarding the hashed value of the kmer */ static tommy_hashdyn hashtable; @@ -33,8 +46,13 @@ int kr_init (uint nb_seqs, kmer_t * kmers, seq_t * sequences) { object->kmers = kmers + idx*2; object->seq = sequences + idx; - tommy_hashdyn_insert(&hashtable, &object->node, object, object->kmers[0].hash); - tommy_hashdyn_insert(&hashtable, &object->node, object, object->kmers[1].hash); + tommy_hashdyn_insert(&hashtable, &(object->node), object, object->kmers[0].hash); + + object = malloc(sizeof(hashed_object_t)); + object->kmers = kmers + idx*2; + object->seq = sequences + idx; + + tommy_hashdyn_insert(&hashtable, &(object->node), object, object->kmers[1].hash); } return TRUE; @@ -57,29 +75,23 @@ typedef struct { void search_hash (uint64_t h, seq_t * ref, uint seq_offset, result_t * result) { /* search for the hash value */ - tommy_node* i = tommy_hashdyn_bucket(&hashtable, h); - uint nb_obj = 0; + hashed_object_t * obj = tommy_hashdyn_search(&hashtable, compare, &h, h); + + // Init + result->seq = NULL; - /* Look for possible collided values */ - while (i) { - hashed_object_t * obj = i->data; + if (obj) { if (h == obj->kmers[0].hash && verify_sequence(ref, seq_offset, obj)) { /* Create the result */ result->offset = seq_offset-obj->kmers[0].offset; result->seq = obj->seq; result->direction = '+'; - break; } else if (h == obj->kmers[1].hash && verify_revert_sequence(ref, seq_offset, obj)) { result->offset = seq_offset - obj->seq->length + obj->kmers[1].offset + HASH_SIZE; result->seq = obj->seq; result->direction = '-'; - break; } - - i = i->next; } - - result->seq = NULL; } int verify_sequence (seq_t * ref, uint seq_offset, hashed_object_t * obj) { @@ -138,7 +150,6 @@ int kr_align (seq_t * seq, FILE * output) { /* --- Hash recurence --- */ for (uint idx=0 ; idx<seq->length - HASH_SIZE ; idx++) { h = hash_decal(h, seq->value[idx+HASH_SIZE-1]); - printf("%lu\n", h); // --- Search forward hash --- search_hash(h, seq, idx, &result); // Interpret search result diff --git a/tests/kr_tests.c b/tests/kr_tests.c index ed5e4dac6633b695109f54234d8289a1fd2f086c..b2d2d9712f23952e600f0ca26c4ba00dd7cfa35a 100644 --- a/tests/kr_tests.c +++ b/tests/kr_tests.c @@ -129,17 +129,19 @@ char * fasta_against_sequence_with_overlap_and_rev_comp_test () { mu_assert("Impossible to execute a kr alignment with the sequence 0", kr_align(sequences, tmp)); destroy_buffer(sequences_buffer); free(sequences); + fclose(tmp); tmp = fopen(tmp_file, "r"); size_t buff_size = 1024; char * buffer = malloc(buff_size * sizeof(char)); - mu_assert("0 lines in output", getline(&buffer, &buff_size, tmp) == 0); + int lines = getline(&buffer, &buff_size, tmp); + mu_assert("0 lines in output", lines != -1); mu_assert("test allele 2 not found at position 964", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_2\t+\t964\n") == 0); - mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) == 0); + mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) != -1); mu_assert("test allele 2 not found at position 1929", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_2\t+\t1929\n") == 0); - mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) == 0); - mu_assert("test allele 3 not found at position 2895", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_3\t-\t2895\n") == 0); + mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) != -1); + mu_assert("test allele 3 not found at position 2895", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_3\t-\t3829\n") == 0); mu_assert("Too much matches on test_seq_overlapp.fasta", getline(&buffer, &buff_size, tmp) != 0); free(buffer); @@ -166,13 +168,13 @@ char * rabinkarp_tests() { char filename[] = "tests/data/data2.fasta"; buffer_t * alleles_buffer = init_buffer(filename); - uint nb_seq = read_sequences(filename, &alleles, alleles_buffer); + nb_seqs = read_sequences(filename, &alleles, alleles_buffer); /* --- init kmers --- */ FILE * kmers_file = fopen("tests/data/data2.bin", "r"); kmers_init(kmers_file); - kmers = malloc(2 * nb_seq * sizeof(kmer_t)); - next_kmers(kmers_file, kmers, nb_seq * 2); + kmers = malloc(2 * nb_seqs * sizeof(kmer_t)); + next_kmers(kmers_file, kmers, nb_seqs * 2); fclose(kmers_file);