Skip to content
Snippets Groups Projects
Commit 9537672d authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

kr_refactor

parent acc68ceb
Branches index
No related tags found
No related merge requests found
...@@ -15,6 +15,19 @@ typedef struct { ...@@ -15,6 +15,19 @@ typedef struct {
seq_t * seq; seq_t * seq;
} hashed_object_t; } hashed_object_t;
/* Compare must return the distance between the object and the value.
* So 0 correspond to match
*/
int compare (const void* arg, const void* obj) {
hashed_object_t * hashed = (hashed_object_t *)obj;
uint64_t hash = *((uint64_t *)arg);
if (hash == hashed->kmers[0].hash || hash == hashed->kmers[1].hash)
return 0;
else
return 1;
}
/* Hashtable to store all the reads and main kmers regarding the hashed value of the kmer */ /* Hashtable to store all the reads and main kmers regarding the hashed value of the kmer */
static tommy_hashdyn hashtable; static tommy_hashdyn hashtable;
...@@ -33,8 +46,13 @@ int kr_init (uint nb_seqs, kmer_t * kmers, seq_t * sequences) { ...@@ -33,8 +46,13 @@ int kr_init (uint nb_seqs, kmer_t * kmers, seq_t * sequences) {
object->kmers = kmers + idx*2; object->kmers = kmers + idx*2;
object->seq = sequences + idx; object->seq = sequences + idx;
tommy_hashdyn_insert(&hashtable, &object->node, object, object->kmers[0].hash); tommy_hashdyn_insert(&hashtable, &(object->node), object, object->kmers[0].hash);
tommy_hashdyn_insert(&hashtable, &object->node, object, object->kmers[1].hash);
object = malloc(sizeof(hashed_object_t));
object->kmers = kmers + idx*2;
object->seq = sequences + idx;
tommy_hashdyn_insert(&hashtable, &(object->node), object, object->kmers[1].hash);
} }
return TRUE; return TRUE;
...@@ -57,29 +75,23 @@ typedef struct { ...@@ -57,29 +75,23 @@ typedef struct {
void search_hash (uint64_t h, seq_t * ref, uint seq_offset, result_t * result) { void search_hash (uint64_t h, seq_t * ref, uint seq_offset, result_t * result) {
/* search for the hash value */ /* search for the hash value */
tommy_node* i = tommy_hashdyn_bucket(&hashtable, h); hashed_object_t * obj = tommy_hashdyn_search(&hashtable, compare, &h, h);
uint nb_obj = 0;
// Init
result->seq = NULL;
/* Look for possible collided values */ if (obj) {
while (i) {
hashed_object_t * obj = i->data;
if (h == obj->kmers[0].hash && verify_sequence(ref, seq_offset, obj)) { if (h == obj->kmers[0].hash && verify_sequence(ref, seq_offset, obj)) {
/* Create the result */ /* Create the result */
result->offset = seq_offset-obj->kmers[0].offset; result->offset = seq_offset-obj->kmers[0].offset;
result->seq = obj->seq; result->seq = obj->seq;
result->direction = '+'; result->direction = '+';
break;
} else if (h == obj->kmers[1].hash && verify_revert_sequence(ref, seq_offset, obj)) { } else if (h == obj->kmers[1].hash && verify_revert_sequence(ref, seq_offset, obj)) {
result->offset = seq_offset - obj->seq->length + obj->kmers[1].offset + HASH_SIZE; result->offset = seq_offset - obj->seq->length + obj->kmers[1].offset + HASH_SIZE;
result->seq = obj->seq; result->seq = obj->seq;
result->direction = '-'; result->direction = '-';
break;
} }
i = i->next;
} }
result->seq = NULL;
} }
int verify_sequence (seq_t * ref, uint seq_offset, hashed_object_t * obj) { int verify_sequence (seq_t * ref, uint seq_offset, hashed_object_t * obj) {
...@@ -138,7 +150,6 @@ int kr_align (seq_t * seq, FILE * output) { ...@@ -138,7 +150,6 @@ int kr_align (seq_t * seq, FILE * output) {
/* --- Hash recurence --- */ /* --- Hash recurence --- */
for (uint idx=0 ; idx<seq->length - HASH_SIZE ; idx++) { for (uint idx=0 ; idx<seq->length - HASH_SIZE ; idx++) {
h = hash_decal(h, seq->value[idx+HASH_SIZE-1]); h = hash_decal(h, seq->value[idx+HASH_SIZE-1]);
printf("%lu\n", h);
// --- Search forward hash --- // --- Search forward hash ---
search_hash(h, seq, idx, &result); search_hash(h, seq, idx, &result);
// Interpret search result // Interpret search result
......
...@@ -129,17 +129,19 @@ char * fasta_against_sequence_with_overlap_and_rev_comp_test () { ...@@ -129,17 +129,19 @@ char * fasta_against_sequence_with_overlap_and_rev_comp_test () {
mu_assert("Impossible to execute a kr alignment with the sequence 0", kr_align(sequences, tmp)); mu_assert("Impossible to execute a kr alignment with the sequence 0", kr_align(sequences, tmp));
destroy_buffer(sequences_buffer); destroy_buffer(sequences_buffer);
free(sequences); free(sequences);
fclose(tmp);
tmp = fopen(tmp_file, "r"); tmp = fopen(tmp_file, "r");
size_t buff_size = 1024; size_t buff_size = 1024;
char * buffer = malloc(buff_size * sizeof(char)); char * buffer = malloc(buff_size * sizeof(char));
mu_assert("0 lines in output", getline(&buffer, &buff_size, tmp) == 0); int lines = getline(&buffer, &buff_size, tmp);
mu_assert("0 lines in output", lines != -1);
mu_assert("test allele 2 not found at position 964", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_2\t+\t964\n") == 0); mu_assert("test allele 2 not found at position 964", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_2\t+\t964\n") == 0);
mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) == 0); mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) != -1);
mu_assert("test allele 2 not found at position 1929", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_2\t+\t1929\n") == 0); mu_assert("test allele 2 not found at position 1929", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_2\t+\t1929\n") == 0);
mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) == 0); mu_assert("not enougth lines in output", getline(&buffer, &buff_size, tmp) != -1);
mu_assert("test allele 3 not found at position 2895", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_3\t-\t2895\n") == 0); mu_assert("test allele 3 not found at position 2895", strcmp(buffer, "Test_overlapping Must find allele 2x2 but not the 1 and 3 (1 nucleotide missing)\trbsC_S_3\t-\t3829\n") == 0);
mu_assert("Too much matches on test_seq_overlapp.fasta", getline(&buffer, &buff_size, tmp) != 0); mu_assert("Too much matches on test_seq_overlapp.fasta", getline(&buffer, &buff_size, tmp) != 0);
free(buffer); free(buffer);
...@@ -166,13 +168,13 @@ char * rabinkarp_tests() { ...@@ -166,13 +168,13 @@ char * rabinkarp_tests() {
char filename[] = "tests/data/data2.fasta"; char filename[] = "tests/data/data2.fasta";
buffer_t * alleles_buffer = init_buffer(filename); buffer_t * alleles_buffer = init_buffer(filename);
uint nb_seq = read_sequences(filename, &alleles, alleles_buffer); nb_seqs = read_sequences(filename, &alleles, alleles_buffer);
/* --- init kmers --- */ /* --- init kmers --- */
FILE * kmers_file = fopen("tests/data/data2.bin", "r"); FILE * kmers_file = fopen("tests/data/data2.bin", "r");
kmers_init(kmers_file); kmers_init(kmers_file);
kmers = malloc(2 * nb_seq * sizeof(kmer_t)); kmers = malloc(2 * nb_seqs * sizeof(kmer_t));
next_kmers(kmers_file, kmers, nb_seq * 2); next_kmers(kmers_file, kmers, nb_seqs * 2);
fclose(kmers_file); fclose(kmers_file);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment