diff --git a/src/gen-word-list.c b/src/gen-word-list.c
index d3c4a868a1f386c185496cd1089e7c795291601d..fffd0a653dd5edf5e57298c084bbd4c01e1264dd 100644
--- a/src/gen-word-list.c
+++ b/src/gen-word-list.c
@@ -351,8 +351,8 @@ gen_word_list_remove_duplicates (GenWordList *word_list)
 }
 
 static gint
-word_array_sort (gconstpointer a,
-                 gconstpointer b)
+word_entry_comparator (gconstpointer a,
+                       gconstpointer b)
 {
   WordEntry *entry_a = (WordEntry *)a;
   WordEntry *entry_b = (WordEntry *)b;
@@ -367,7 +367,7 @@ word_array_sort (gconstpointer a,
 void
 gen_word_list_sort (GenWordList *word_list)
 {
-  g_array_sort (word_list->words, (GCompareFunc) word_array_sort);
+  g_array_sort (word_list->words, (GCompareFunc) word_entry_comparator);
   /* Enumerations are pre-sorted from the dupe removal */
 #if 0
   for (uint i = 0; i < word_list->words->len; i++)
diff --git a/src/word-list-misc.c b/src/word-list-misc.c
index 7eaa2cfc2c21fefcbf955102ce28105a91f5e299..5186264d9fdc77a67d361caa053dbc662fc74507 100644
--- a/src/word-list-misc.c
+++ b/src/word-list-misc.c
@@ -306,6 +306,69 @@ word_array_find (WordArray *word_array,
                                 out);
 }
 
+/* GCompareFunc that orders two WordIndex elements by their index
+ * field. Compares instead of subtracting so it cannot overflow. */
+static gint
+word_index_comparator (gconstpointer a,
+                       gconstpointer b)
+{
+  gint index_a = ((WordIndex *) a)->index;
+  gint index_b = ((WordIndex *) b)->index;
+
+  return (index_a > index_b) - (index_a < index_b);
+}
+
+/* Sorts @word_array in place by ascending index. Passing NULL is a
+ * no-op. */
+void
+word_array_sort (WordArray *word_array)
+{
+  if (word_array == NULL)
+    return;
+  g_array_sort ((GArray *) word_array, word_index_comparator);
+}
+
+/* Returns TRUE if both arrays hold the same indices in the same
+ * order. Two NULL arrays are equal; NULL never equals non-NULL. */
+gboolean
+word_array_equals (WordArray *word_array1, WordArray *word_array2)
+{
+  if (word_array1 == NULL || word_array2 == NULL)
+    return word_array1 == word_array2;
+  if (word_array1->len != word_array2->len)
+    return FALSE;
+  for (guint i = 0; i < word_array1->len; i++)
+    {
+      gint index1 = (g_array_index (word_array1, WordIndex, i)).index;
+      gint index2 = (g_array_index (word_array2, WordIndex, i)).index;
+      if (index1 != index2)
+        return FALSE;
+    }
+  return TRUE;
+}
+
+#define MAX_WORDS 100
+/* Debugging aid: logs at most MAX_WORDS indices of @word_array, one
+ * per g_message() call. A NULL array is reported as such. */
+void
+word_array_print (WordArray *word_array)
+{
+  if (word_array == NULL)
+    {
+      g_message ("WordArray is NULL.");
+      return;
+    }
+  WordIndex current;
+  g_message ("WordArray %p:", (gpointer) word_array);
+  /* FIXME(debugging): Maybe better to print them all in one line. */
+  /* FIXME(debugging): Print first 50 and last 50 words instead of first 100. */
+  for (guint i = 0; i < MIN (word_array->len, MAX_WORDS); i++)
+    {
+      current = g_array_index (word_array, WordIndex, i);
+      g_message ("\t%d", current.index);
+    }
+}
+
 #ifdef TESTING
 static void
 computes_letters_size (void)
diff --git a/src/word-list-misc.h b/src/word-list-misc.h
index 180a803eb6bfb5551bc8b3622c8e09d76662a79d..fa3b6877f107937c374bd2a5f2a8fbb4711b0a8f 100644
--- a/src/word-list-misc.h
+++ b/src/word-list-misc.h
@@ -75,15 +75,15 @@ gchar *scored_parse_word (const gchar *unparsed_word,
                           IpuzCharset *alphabet);
 
 
-/* WordArrays are a simple list of words represented as
+/* A WordArray is a simple list of words represented as
  * WordIndex. They are unique: inserting a word multiple times results
- * in the word only existing once, and the array is always sorted. It
- * is used to keep a list of words we don't want to search through, as
- * well as used internally within the word-list.
+ * in the word only existing once. It is used to keep a list of words
+ * we don't want to search through, as well as used internally within
+ * the word-list.
  *
  * It's possible to use the GArray functions instead of the WordArray
- * functions, but if you do that, you need to keep the
- * uniqueness/sorted invariant true. The WordList does this directly
+ * functions. But if you do that, you need to manually keep the
+ * uniqueness invariant true. The WordList does this directly
  * at times.
  *
  * It's not recommended that you use this structure unless you really
@@ -102,6 +102,11 @@ gboolean word_array_remove (WordArray *word_array,
 gboolean   word_array_find   (WordArray *word_array,
                               WordIndex  word_index,
                               guint     *out);
+void       word_array_sort   (WordArray *word_array);
+gboolean   word_array_equals (WordArray *word_array1,
+                              WordArray *word_array2);
+void       word_array_print  (WordArray *word_array);
+
 #define word_array_len(wa) (((GArray*)wa)->len)
 #define word_array_index(wa,i) (g_array_index((GArray*)wa,WordIndex,i))
 #define word_array_ref(wa) (g_array_ref((GArray*)wa))
diff --git a/src/word-list-tests.c b/src/word-list-tests.c
index d52191a9471d6902af4f706492c4f830669649b8..d84ab52d4be3935bac1e44768e9903eb854395da 100644
--- a/src/word-list-tests.c
+++ b/src/word-list-tests.c
@@ -45,14 +45,15 @@
 
 //#define PROFILE_TEST
 
+/* FIXME(tests): Intersection function generates bad results with TEST_RESOURCE. */
 static WordList *
-get_test_word_list (void)
+get_word_list (const gchar *resource_path)
 {
   g_autofree gchar *path = NULL;
   g_autoptr (WordListResource) resource = NULL;
   WordList *word_list = NULL;
 
-  path = g_test_build_filename (G_TEST_BUILT, TEST_RESOURCE, NULL);
+  path = g_test_build_filename (G_TEST_BUILT, resource_path, NULL);
   resource = word_list_resource_new_from_file (path);
 
   word_list = g_object_new (WORD_TYPE_LIST,
@@ -62,6 +63,18 @@ get_test_word_list (void)
   return word_list;
 }
 
+static WordList *
+get_test_word_list (void)
+{
+  return get_word_list (TEST_RESOURCE);
+}
+
+static WordList *
+get_broda_word_list (void)
+{
+  return get_word_list (BRODA_RESOURCE);
+}
+
 #if 0
 static void
 dump_bytes (GBytes *bytes)
@@ -254,6 +267,32 @@ utf8_roundtrip (void)
 
 #endif
 
+/* Returns TRUE if the indices in @word_array are in non-decreasing
+ * order. NULL and empty arrays count as sorted. */
+static gboolean
+word_array_is_sorted (WordArray *word_array)
+{
+  gint prev_index;
+  gint curr_index;
+  WordIndex current;
+
+  if (word_array == NULL || word_array->len == 0)
+    return TRUE;
+
+  current = g_array_index (word_array, WordIndex, 0);
+  prev_index = current.index;
+  /* Element 0 seeded prev_index, so start comparing at element 1. */
+  for (guint i = 1; i < word_array->len; i++)
+    {
+      current = g_array_index (word_array, WordIndex, i);
+      curr_index = current.index;
+      if (curr_index < prev_index)
+        return FALSE;
+      prev_index = curr_index;
+    }
+  return TRUE;
+}
+
 static void
 anagram_test (void)
 {
@@ -275,7 +314,7 @@ anagram_test (void)
 //#define DEBUG_INTERSECTION
 
 static void
-intersection_test (WordList *wl,
+intersection_test (WordList    *wl,
                    const gchar *filter1,
                    guint        pos1,
                    const gchar *filter2,
@@ -326,8 +365,10 @@ intersection_test (WordList *wl,
       g_print ("%s\n", word_list_get_indexed_word (wl, word_index));
     }
 #endif
-  g_assert_cmpstr (intersection_str, ==, charset_str);
 
+  g_assert_cmpstr (intersection_str, ==, charset_str);
+  g_assert_true (word_array_is_sorted (word_array1));
+  g_assert_true (word_array_is_sorted (word_array2));
 }
 
 static void
@@ -363,6 +404,59 @@ find_intersection (void)
                      "AELRT");
 }
 
+/* Checks that intersecting @filter at each of its '?' positions with
+ * an all-wildcard crossing filter of the same length yields sorted,
+ * identical word arrays. */
+static void
+equivalent_intersections_test (WordList    *wl,
+                               const gchar *filter)
+{
+  g_autofree gchar *crossing_filter = NULL;
+  const gchar *ptr;
+  g_autoptr (WordArray) prev_word_array = NULL;
+
+  crossing_filter = g_strnfill (g_utf8_strlen (filter, -1), '?');
+
+  ptr = filter;
+  for (guint n = 0; *ptr != '\0'; ptr = g_utf8_next_char (ptr), n++)
+    {
+      /* Scoped per iteration so every result is released exactly once. */
+      g_autoptr (WordArray) curr_word_array = NULL;
+
+      if (*ptr != '?')
+        continue;
+
+      word_list_find_intersection (wl,
+                                   filter, n,
+                                   crossing_filter, 0,
+                                   NULL,
+                                   &curr_word_array,
+                                   NULL);
+      g_assert_true (word_array_is_sorted (curr_word_array));
+      /* Hand ownership of the first result to prev_word_array rather
+       * than aliasing it; two g_autoptr owners would both clean it up. */
+      if (prev_word_array == NULL)
+        prev_word_array = g_steal_pointer (&curr_word_array);
+      else
+        g_assert_true (word_array_equals (curr_word_array, prev_word_array));
+    }
+}
+
+/* Runs the equivalence check on the list loaded from BRODA_RESOURCE. */
+static void
+equivalent_intersections (void)
+{
+  g_autoptr (WordList) wl = get_broda_word_list ();
+  equivalent_intersections_test (wl, "??");
+  equivalent_intersections_test (wl, "???");
+  equivalent_intersections_test (wl, "??????????????");
+  equivalent_intersections_test (wl, "A??");
+  equivalent_intersections_test (wl, "?A?");
+  equivalent_intersections_test (wl, "??A");
+  equivalent_intersections_test (wl, "???A???A???A???");
+  equivalent_intersections_test (wl, "ABBACCCCBABA");
+}
+
 #define EPSILON 0.001
 
 static void
@@ -446,6 +540,7 @@ main (int argc, char **argv)
 
   g_test_add_func ("/word_list/set_anagram", anagram_test);
   g_test_add_func ("/word_list/find_intersection", find_intersection);
+  g_test_add_func ("/word_list/equivalent_intersections", equivalent_intersections);
   g_test_add_func ("/word_list/frequency_test", frequency_test);
 
 #ifdef PROFILE_TEST
diff --git a/src/word-list.c b/src/word-list.c
index 8ed77beba37732f98004e4f8b5dacee566f90e35..60cbefb528fec7e81fe5cddbba43aad2294000b2 100644
--- a/src/word-list.c
+++ b/src/word-list.c
@@ -1028,6 +1028,19 @@ word_list_dump (WordList *word_list)
 }
 
 
+/* Sorts each output array the caller asked for. Either pointer may be
+ * NULL, and word_array_sort() itself ignores a NULL array. */
+static void
+sort_word_arrays (WordArray **word_array1,
+                  WordArray **word_array2)
+{
+  if (word_array1 != NULL)
+    word_array_sort (*word_array1);
+  if (word_array2 != NULL)
+    word_array_sort (*word_array2);
+}
+
+
 static GArray *
 get_filter_intersect_list (WordList *word_list,
                            const gchar *filter)
@@ -1278,6 +1291,7 @@ calculate_intersect_special (WordList *word_list,
         ipuz_charset_builder_set_char_count (builder, c, filter1_len * filter2_len);
       *intersecting_chars = ipuz_charset_builder_build (builder);
     }
+  sort_word_arrays (word_array1, word_array2);
 }
 
 
@@ -1461,6 +1475,7 @@ word_list_find_intersection (WordList *word_list,
                                        word_array1?*word_array1:NULL);
           if (intersecting_chars)
             *intersecting_chars = ipuz_charset_ref (charset1);
+          sort_word_arrays (word_array1, word_array2);
         }
       else if (filter2_valid)
         {
@@ -1471,6 +1486,7 @@ word_list_find_intersection (WordList *word_list,
                                        word_array2?*word_array2:NULL);
           if (intersecting_chars)
             *intersecting_chars = ipuz_charset_ref (charset2);
+          sort_word_arrays (word_array1, word_array2);
         }
 
       return;
@@ -1505,6 +1521,8 @@ word_list_find_intersection (WordList *word_list,
                                charset2,
                                word_array1?*word_array1:NULL);
 
+  sort_word_arrays (word_array1, word_array2);
+
   /* copy our charset over to intersecting_chars */
   if (intersecting_chars)
     *intersecting_chars = ipuz_charset_ref (charset2);