Commit 4fa9cf28 authored by Aleksander Morgado, committed by Martyn Russell

Fixes GB#616845 - Avoid word counting in the extractors

 * New max_bytes parameter added to tracker-extract config file. Extractors will
    read up to that configured limit.
 * Removed the need to read the FTS config file from tracker-extract.
 * Word counting is no longer done in the extractors.

 Note: As a side effect, the last word extracted when max_bytes is reached
  may get cut, in which case only its first chunk is extracted.
parent 4207c6a4
......@@ -183,7 +183,7 @@ tracker_coalesce (gint n_values,
* Since: 0.9
**/
gchar *
tracker_merge_const (const gchar *delimiter,
tracker_merge_const (const gchar *delimiter,
gint n_values,
...)
{
......@@ -239,7 +239,7 @@ tracker_merge_const (const gchar *delimiter,
* Deprecated: 1.0: Use tracker_merge_const() instead.
**/
gchar *
tracker_merge (const gchar *delimiter,
tracker_merge (const gchar *delimiter,
gint n_values,
...)
{
......@@ -304,6 +304,8 @@ tracker_merge (const gchar *delimiter,
* be freed with g_free() when finished with, otherwise %NULL.
*
* Since: 0.8
*
* Deprecated: 1.0: Use tracker_text_validate_utf8() instead.
**/
gchar *
tracker_text_normalize (const gchar *text,
......@@ -345,16 +347,62 @@ tracker_text_normalize (const gchar *text,
}
if (n_words) {
if (!in_break) {
/* Count the last word */
words += 1;
}
if (!in_break) {
/* Count the last word */
words += 1;
}
*n_words = words;
}
return g_string_free (string, FALSE);
}
/**
 * tracker_text_validate_utf8:
 * @text: the text to validate
 * @text_len: length of @text in bytes, or -1 if @text is NUL-terminated
 * @str: location of the #GString where the validated characters are placed;
 *  if *@str is %NULL a new #GString is created and stored there
 *
 * This function checks @text for UTF-8 validity using g_utf8_validate(),
 * and appends the leading chunk of valid characters (everything up to the
 * first invalid byte, or the whole text if it is fully valid) to @str.
 *
 * Returns: %TRUE if at least one byte of valid UTF-8 from @text was
 * appended to @str, %FALSE otherwise (in which case @str is left untouched).
 *
 * Since: 0.9
 **/
gboolean
tracker_text_validate_utf8 (const gchar  *text,
                            gsize         text_len,
                            GString     **str)
{
	const gchar *end;
	gsize len_to_validate;

	g_return_val_if_fail (text, FALSE);
	g_return_val_if_fail (str, FALSE);

	/* text_len is unsigned, so the documented "-1" arrives here as
	 * (gsize) -1. A "text_len >= 0" test can never be false; the
	 * sentinel has to be compared explicitly. */
	if (text_len == (gsize) -1) {
		len_to_validate = strlen (text);
	} else {
		len_to_validate = text_len;
	}

	if (len_to_validate == 0) {
		return FALSE;
	}

	/* Validate string, getting the pointer to the first non-valid
	 * character (if any) or to the end of the validated range. */
	end = text;
	g_utf8_validate (text, (gssize) len_to_validate, &end);

	if (end <= text) {
		/* Not even the first character is valid */
		return FALSE;
	}

	/* Create the output string if the caller did not provide one */
	if (*str == NULL) {
		*str = g_string_new_len (text, end - text);
	} else {
		g_string_append_len (*str, text, end - text);
	}

	return TRUE;
}
/**
* tracker_date_format_to_iso8601:
* @date_string: the date in a string pointer
......
......@@ -34,11 +34,14 @@ gchar* tracker_coalesce (gint n_values,
gchar* tracker_merge (const gchar *delimiter,
gint n_values,
...) G_GNUC_DEPRECATED;
#endif /* TRACKER_DISABLE_DEPRECATED */
gchar* tracker_text_normalize (const gchar *text,
guint max_words,
guint *n_words);
guint *n_words) G_GNUC_DEPRECATED;
#endif /* TRACKER_DISABLE_DEPRECATED */
gboolean tracker_text_validate_utf8 (const gchar *text,
gsize text_len,
GString **str);
gchar* tracker_date_guess (const gchar *date_string);
gchar* tracker_date_format_to_iso8601 (const gchar *date_string,
const gchar *format);
......
......@@ -321,8 +321,6 @@ tracker_extract_SOURCES = \
tracker-dbus.h \
tracker-extract.c \
tracker-extract.h \
tracker-fts-config.c \
tracker-fts-config.h \
tracker-main.c \
tracker-main.h \
tracker-albumart-generic.h
......
......@@ -30,10 +30,12 @@
/* Default values */
#define DEFAULT_VERBOSITY 0
#define DEFAULT_MAX_BYTES 1048576 /* 1Mbyte */
typedef struct {
/* General */
gint verbosity;
gint max_bytes;
} TrackerConfigPrivate;
typedef struct {
......@@ -63,11 +65,13 @@ enum {
PROP_0,
/* General */
PROP_VERBOSITY
PROP_VERBOSITY,
PROP_MAX_BYTES
};
static ObjectToKeyFile conversions[] = {
{ G_TYPE_INT, "verbosity", GROUP_GENERAL, "Verbosity" },
{ G_TYPE_INT, "max_bytes", GROUP_GENERAL, "Max Bytes" },
};
G_DEFINE_TYPE (TrackerConfig, tracker_config, TRACKER_TYPE_CONFIG_FILE);
......@@ -93,6 +97,16 @@ tracker_config_class_init (TrackerConfigClass *klass)
DEFAULT_VERBOSITY,
G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
/* Each property needs its own ID: config_set_property() and
 * config_get_property() dispatch on it, and handle "max_bytes" under
 * PROP_MAX_BYTES. */
g_object_class_install_property (object_class,
                                 PROP_MAX_BYTES,
                                 g_param_spec_int ("max_bytes",
                                                   "Max Bytes",
                                                   " Maximum number of UTF-8 bytes to extract [0,G_MAXINT]",
                                                   0,
                                                   G_MAXINT,
                                                   DEFAULT_MAX_BYTES,
                                                   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
g_type_class_add_private (object_class, sizeof (TrackerConfigPrivate));
}
......@@ -105,7 +119,7 @@ static void
config_set_property (GObject *object,
guint param_id,
const GValue *value,
GParamSpec *pspec)
GParamSpec *pspec)
{
switch (param_id) {
/* General */
......@@ -114,6 +128,11 @@ config_set_property (GObject *object,
g_value_get_int (value));
break;
case PROP_MAX_BYTES:
tracker_config_set_max_bytes (TRACKER_CONFIG (object),
g_value_get_int (value));
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
break;
......@@ -136,6 +155,10 @@ config_get_property (GObject *object,
g_value_set_int (value, priv->verbosity);
break;
case PROP_MAX_BYTES:
g_value_set_int (value, priv->max_bytes);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
break;
......@@ -317,3 +340,34 @@ tracker_config_set_verbosity (TrackerConfig *config,
priv->verbosity = value;
g_object_notify (G_OBJECT (config), "verbosity");
}
/**
 * tracker_config_get_max_bytes:
 * @config: a #TrackerConfig
 *
 * Reads the "max_bytes" setting currently stored in @config.
 *
 * Returns: the stored value, or DEFAULT_MAX_BYTES if @config is not a
 * #TrackerConfig.
 **/
gint
tracker_config_get_max_bytes (TrackerConfig *config)
{
	TrackerConfigPrivate *self_priv;
	gint                  max_bytes;

	g_return_val_if_fail (TRACKER_IS_CONFIG (config), DEFAULT_MAX_BYTES);

	self_priv = TRACKER_CONFIG_GET_PRIVATE (config);
	max_bytes = self_priv->max_bytes;

	return max_bytes;
}
/**
 * tracker_config_set_max_bytes:
 * @config: a #TrackerConfig
 * @value: new value for the "max_bytes" setting
 *
 * Stores @value as the "max_bytes" setting of @config and emits a property
 * notification for it. Nothing is changed when @config is not a
 * #TrackerConfig or when tracker_keyfile_object_validate_int() rejects
 * @value.
 **/
void
tracker_config_set_max_bytes (TrackerConfig *config,
                              gint           value)
{
	TrackerConfigPrivate *self_priv;

	g_return_if_fail (TRACKER_IS_CONFIG (config));

	if (tracker_keyfile_object_validate_int (config, "max_bytes", value)) {
		/* Value accepted: store it and let listeners know */
		self_priv = TRACKER_CONFIG_GET_PRIVATE (config);
		self_priv->max_bytes = value;

		g_object_notify (G_OBJECT (config), "max_bytes");
	}
}
......@@ -53,6 +53,10 @@ gint tracker_config_get_verbosity (TrackerConfig *config);
void tracker_config_set_verbosity (TrackerConfig *config,
gint value);
gint tracker_config_get_max_bytes (TrackerConfig *config);
void tracker_config_set_max_bytes (TrackerConfig *config,
gint value);
G_END_DECLS
#endif /* __TRACKER_EXTRACT_CONFIG_H__ */
......
......@@ -41,7 +41,7 @@ typedef struct {
const gchar *uri;
guint in_body : 1;
GString *plain_text;
guint n_words;
guint n_bytes_remaining;
} parser_data;
static void extract_html (const gchar *filename,
......@@ -212,24 +212,28 @@ parser_characters (void *data,
case READ_IGNORE:
break;
default:
if (pd->in_body && pd->n_words > 0) {
gchar *text;
guint n_words;
text = tracker_text_normalize (ch, pd->n_words, &n_words);
if (text && *text) {
g_string_append (pd->plain_text, text);
if (pd->in_body && pd->n_bytes_remaining > 0) {
gsize text_len;
text_len = strlen (ch);
if (tracker_text_validate_utf8 (ch,
(pd->n_bytes_remaining < text_len ?
pd->n_bytes_remaining :
text_len),
&pd->plain_text)) {
/* In the case of HTML, each string arriving this
* callback is independent to any other previous
* string, so need to add an explicit whitespace
* separator */
g_string_append_c (pd->plain_text, ' ');
if (n_words > pd->n_words) {
pd->n_words = 0;
} else {
pd->n_words -= n_words;
}
}
g_free (text);
if (pd->n_bytes_remaining > text_len) {
pd->n_bytes_remaining -= text_len;
} else {
pd->n_bytes_remaining = 0;
}
}
break;
}
......@@ -240,7 +244,7 @@ extract_html (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
TrackerFTSConfig *fts_config;
TrackerConfig *config;
htmlDocPtr doc;
parser_data pd;
gchar *filename;
......@@ -288,8 +292,8 @@ extract_html (const gchar *uri,
pd.uri = uri;
pd.plain_text = g_string_new (NULL);
fts_config = tracker_main_get_fts_config ();
pd.n_words = tracker_fts_config_get_max_words_to_index (fts_config);
config = tracker_main_get_config ();
pd.n_bytes_remaining = tracker_config_get_max_bytes (config);
filename = g_filename_from_uri (uri, NULL, NULL);
doc = htmlSAXParseFile (filename, NULL, &handler, &pd);
......
......@@ -394,8 +394,6 @@ read_32bit (const guint8 *buffer)
* @param chunk_size Number of valid bytes in the input buffer
* @param is_ansi If %TRUE, input text should be encoded in CP1252, and
* in UTF-16 otherwise.
* @param p_words_remaining Pointer to #gint specifying how many words
* should still be considered.
* @param p_bytes_remaining Pointer to #gsize specifying how many bytes
* should still be considered.
* @param p_content Pointer to a #GString where the output normalized words
......@@ -405,7 +403,6 @@ static void
msoffice_convert_and_normalize_chunk (guint8 *buffer,
gsize chunk_size,
gboolean is_ansi,
gint *p_words_remaining,
gsize *p_bytes_remaining,
GString **p_content)
{
......@@ -415,7 +412,6 @@ msoffice_convert_and_normalize_chunk (guint8 *buffer,
g_return_if_fail (buffer != NULL);
g_return_if_fail (chunk_size > 0);
g_return_if_fail (p_words_remaining != NULL);
g_return_if_fail (p_bytes_remaining != NULL);
g_return_if_fail (p_content != NULL);
......@@ -432,42 +428,20 @@ msoffice_convert_and_normalize_chunk (guint8 *buffer,
&error);
if (converted_text) {
gchar *normalized_chunk;
guint n_words_normalized;
/* Get normalized chunk */
normalized_chunk = tracker_text_normalize (converted_text,
*p_words_remaining,
&n_words_normalized);
gsize len_to_validate;
/* Update number of words remaining.
* Note that n_words_normalized should always be less or
* equal than n_words_remaining */
*p_words_remaining = (n_words_normalized <= *p_words_remaining ?
*p_words_remaining - n_words_normalized : 0);
len_to_validate = MIN (*p_bytes_remaining, n_bytes_utf8);
/* Update accumulated UTF-8 bytes read */
*p_bytes_remaining = (n_bytes_utf8 <= *p_bytes_remaining ?
*p_bytes_remaining - n_bytes_utf8 : 0);
/* g_debug ("Words normalized: %u (remaining: %u); " */
/* "Bytes read (UTF-8): %" G_GSIZE_FORMAT " bytes " */
/* "(remaining: %" G_GSIZE_FORMAT ")", */
/* n_words_normalized, *p_words_remaining, */
/* n_bytes_utf8, *p_bytes_remaining); */
/* Append normalized chunk to the string to be returned */
if (*p_content) {
g_string_append (*p_content, normalized_chunk);
} else {
*p_content = g_string_new (normalized_chunk);
if (tracker_text_validate_utf8 (converted_text,
len_to_validate,
p_content)) {
/* A whitespace is added to separate next strings appended */
g_string_append_c (*p_content, ' ');
}
/* A whitespace is added to separate next strings appended */
g_string_append (*p_content, " ");
/* Update accumulated UTF-8 bytes read */
*p_bytes_remaining -= len_to_validate;
g_free (converted_text);
g_free (normalized_chunk);
} else {
g_warning ("Couldn't convert %" G_GSIZE_FORMAT " bytes from %s to UTF-8: %s",
chunk_size,
......@@ -659,7 +633,6 @@ ppt_seek_header (GsfInput *stream,
static gchar *
extract_powerpoint_content (GsfInfile *infile,
gint max_words,
gsize max_bytes,
gboolean *is_encrypted)
{
......@@ -733,18 +706,16 @@ extract_powerpoint_content (GsfInfile *infile,
SLIDELISTWITHTEXT_RECORD_TYPE,
SLIDELISTWITHTEXT_RECORD_TYPE,
FALSE)) {
gint words_remaining = max_words;
gsize bytes_remaining = max_bytes;
guint8 *buffer = NULL;
gsize buffer_size = 0;
/*
* Read while we have either TextBytesAtom or
* TextCharsAtom and we have read less than max_words
* amount of words and less than max_bytes (in UTF-8)
* TextCharsAtom and we have read less than max_bytes
* (in UTF-8)
*/
while (words_remaining > 0 &&
bytes_remaining > 0 &&
while (bytes_remaining > 0 &&
ppt_seek_header (stream,
TEXTBYTESATOM_RECORD_TYPE,
TEXTCHARSATOM_RECORD_TYPE,
......@@ -763,7 +734,6 @@ extract_powerpoint_content (GsfInfile *infile,
msoffice_convert_and_normalize_chunk (buffer,
read_size,
FALSE, /* Always UTF-16 */
&words_remaining,
&bytes_remaining,
&all_texts);
}
......@@ -777,45 +747,6 @@ extract_powerpoint_content (GsfInfile *infile,
return all_texts ? g_string_free (all_texts, FALSE) : NULL;
}
/**
* @brief get maximum number of words to index
* @return maximum number of words to index
*/
static gint
fts_max_words (void)
{
TrackerFTSConfig *fts_config;
fts_config = tracker_main_get_fts_config ();
return tracker_fts_config_get_max_words_to_index (fts_config);
}
/**
* @brief get min word length
* @return min_word_length
*/
static gint
fts_min_word_length (void)
{
TrackerFTSConfig *fts_config;
fts_config = tracker_main_get_fts_config ();
return tracker_fts_config_get_min_word_length (fts_config);
}
/**
* @brief get max word length
* @return max_word_length
*/
static gint
fts_max_word_length (void)
{
TrackerFTSConfig *fts_config;
fts_config = tracker_main_get_fts_config ();
return tracker_fts_config_get_max_word_length (fts_config);
}
/**
* @brief Open specified uri for reading and initialize gsf
* @param uri URI of the file to open
......@@ -847,7 +778,6 @@ open_uri (const gchar *uri)
*/
static gchar *
extract_msword_content (GsfInfile *infile,
gint n_words,
gsize n_bytes,
gboolean *is_encrypted)
{
......@@ -863,7 +793,6 @@ extract_msword_content (GsfInfile *infile,
GString *content = NULL;
guint8 *text_buffer = NULL;
gint text_buffer_size = 0;
guint n_words_remaining;
gsize n_bytes_remaining;
document_stream = gsf_infile_child_by_name (infile, "WordDocument");
......@@ -939,14 +868,11 @@ extract_msword_content (GsfInfile *infile,
/* Iterate over pieces...
* Loop is halted whenever one of these conditions is met:
* a) Max bytes to be read reached
* b) Already read up to the max number of words configured
* c) No more pieces to read
* b) No more pieces to read
*/
i = 0;
n_words_remaining = n_words;
n_bytes_remaining = n_bytes;
while (n_words_remaining > 0 &&
n_bytes_remaining > 0 &&
while (n_bytes_remaining > 0 &&
i < piece_count) {
guint8 *piece_descriptor;
gint piece_start;
......@@ -1009,7 +935,6 @@ extract_msword_content (GsfInfile *infile,
msoffice_convert_and_normalize_chunk (text_buffer,
piece_size,
is_ansi,
&n_words_remaining,
&n_bytes_remaining,
&content);
}
......@@ -1295,7 +1220,6 @@ read_excel_string (GsfInput *stream,
static void
xls_get_extended_record_string (GsfInput *stream,
GArray *list,
guint *p_words_remaining,
gsize *p_bytes_remaining,
GString **p_content)
{
......@@ -1337,12 +1261,10 @@ xls_get_extended_record_string (GsfInput *stream,
/* Iterate over chunks...
* Loop is halted whenever one of these conditions is met:
* a) Max bytes to be read reached
* b) Already read up to the max number of words configured
* c) No more chunks to read
* b) No more chunks to read
*/
i = 0;
while (*p_words_remaining > 0 &&
*p_bytes_remaining > 0 &&
while (*p_bytes_remaining > 0 &&
i < cst_unique) {
guint16 cch;
guint16 c_run;
......@@ -1398,7 +1320,6 @@ xls_get_extended_record_string (GsfInput *stream,
msoffice_convert_and_normalize_chunk (buffer,
chunk_size,
!is_high_byte,
p_words_remaining,
p_bytes_remaining,
p_content);
......@@ -1475,7 +1396,6 @@ xls_get_extended_record_string (GsfInput *stream,
*/
static gchar*
extract_excel_content (GsfInfile *infile,
gint n_words,
gsize n_bytes,
gboolean *is_encrypted)
{
......@@ -1483,7 +1403,6 @@ extract_excel_content (GsfInfile *infile,
GString *content = NULL;
GsfInput *stream;
guint saved_offset;
guint n_words_remaining = n_words;
gsize n_bytes_remaining = n_bytes;
stream = gsf_infile_child_by_name (infile, "Workbook");
......@@ -1493,8 +1412,7 @@ extract_excel_content (GsfInfile *infile,
}
/* Read until we reach eof or any of our limits reached */
while (n_words_remaining > 0 &&
n_bytes_remaining > 0 &&
while (n_bytes_remaining > 0 &&
!gsf_input_eof (stream)) {
guint8 tmp_buffer[4] = { 0 };
......@@ -1577,7 +1495,6 @@ extract_excel_content (GsfInfile *infile,
/* Read extended string */
xls_get_extended_record_string (stream,
list,
&n_words_remaining,
&n_bytes_remaining,
&content);
......@@ -1596,8 +1513,7 @@ extract_excel_content (GsfInfile *infile,
g_object_unref (stream);
g_debug ("Words normalized: %u, Bytes: %" G_GSIZE_FORMAT,
n_words - n_words_remaining,
g_debug ("Bytes extracted: %" G_GSIZE_FORMAT,
n_bytes - n_bytes_remaining);
return content ? g_string_free (content, FALSE) : NULL;
......@@ -1696,13 +1612,13 @@ extract_msoffice (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
TrackerConfig *config;
GFile *file = NULL;
GFileInfo *file_info = NULL;
const gchar *mime_used;
GsfInfile *infile = NULL;
gchar *content = NULL;
gboolean is_encrypted = FALSE;
gint max_words;
gsize max_bytes;
file = g_file_new_for_uri (uri);
......@@ -1738,23 +1654,19 @@ extract_msoffice (const gchar *uri,
mime_used = g_file_info_get_content_type (file_info);
/* Set max words to read from content */
max_words = fts_max_words ();
/* Set max bytes to read from content.
* Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
* points are really pretty rare */
max_bytes = 3 * max_words * fts_max_word_length ();
/* Set max bytes to read from content */
config = tracker_main_get_config ();
max_bytes = tracker_config_get_max_bytes (config);
if (g_ascii_strcasecmp (mime_used, "application/msword") == 0) {
/* Word file */
content = extract_msword_content (infile, max_words, max_bytes, &is_encrypted);
content = extract_msword_content (infile, max_bytes, &is_encrypted);
} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-powerpoint") == 0) {
/* PowerPoint file */
content = extract_powerpoint_content (infile, max_words, max_bytes, &is_encrypted);
content = extract_powerpoint_content (infile, max_bytes, &is_encrypted);
} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
/* Excel File */
content = extract_excel_content (infile, max_words, max_bytes, &is_encrypted);
content = extract_excel_content (infile, max_bytes, &is_encrypted);
} else {
g_message ("Mime type was not recognised:'%s'", mime_used);
}
......@@ -1943,20 +1855,21 @@ xml_text_handler_document_data (GMarkupParseContext *context,
MsOfficeXMLParserInfo *info = user_data;
static gboolean found = FALSE;
static gboolean added = FALSE;
guint min_word_length = fts_min_word_length();
switch (info->tag_type) {
case MS_OFFICE_XML_TAG_WORD_TEXT:
if (info->style_element_present) {
if (atoi (text) == 0) {
g_string_append_printf (info->content, "%s ", text);
tracker_text_validate_utf8 (text, -1, &info->content);
g_string_append_c (info->content, ' ');
}
}
if (info->preserve_attribute_present) {
gchar *keywords = g_strdup (text);
if (found && (strlen (keywords) >= min_word_length)) {
g_string_append_printf (info->content, "%s ", text);
if (found) {
tracker_text_validate_utf8 (text, -1, &info->content);
g_string_append_c (info->content, ' ');
found = FALSE;
} else {
gchar *lasts;
......@@ -1979,14 +1892,14 @@ xml_text_handler_document_data (GMarkupParseContext *context,
break;
case MS_OFFICE_XML_TAG_SLIDE_TEXT:
if (strlen (text) > min_word_length) {
g_string_append_printf (info->content, "%s ", text);
}
tracker_text_validate_utf8 (text, -1, &info->content);
g_string_append_c (info->content, ' ');
break;
case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
if ((atoi (text) == 0) && (strlen (text) > min_word_length)) {
g_string_append_printf (info->content, "%s ", text);
if (atoi (text) == 0) {
tracker_text_validate_utf8 (text, -1, &info->content);
g_string_append_c (info->content, ' ');
}
break;
......
......@@ -74,7 +74,6 @@ static TrackerExtractData extract_data[] = {
static gchar *
extract_oasis_content (const gchar *uri,
guint n_words,
gsize n_bytes)
{
const gchar *argv[4];
......@@ -93,9 +92,9 @@ extract_oasis_content (const gchar *uri,
argv[2] = path;
argv[3] = NULL;
g_debug ("Executing command:'%s %s %s' (max words: %u, "
"max_bytes: %" G_GSIZE_FORMAT ")",
argv[0], argv[1], argv[2], n_words, n_bytes);
g_debug ("Executing command:'%s %s %s' "
"(max_bytes: %" G_GSIZE_FORMAT ")",
argv[0], argv[1], argv[2], n_bytes);
/* Fork & spawn */
if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
......@@ -122,50 +121,38 @@ extract_oasis_content (const gchar *uri,
/* Start buffered reading... */
else {
unsigned char buf[ODT_BUFFER_SIZE];
size_t r, accum;
guint n_words_remaining = n_words;
GString *normalized;
size_t r, bytes_remaining;
GString *validated = NULL;
accum = 0;
normalized = g_string_new ("");
bytes_remaining = n_bytes;
/* Reading in chunks of ODT_BUFFER_SIZE -1 (8192)
* Loop is halted whenever one of these conditions is met:
* a) Read bytes reached the maximum allowed (n_bytes)
* b) Already read up to the max number of words configured
* c) No more bytes to read
* b) No more bytes to read
*/
while ((accum <= n_bytes) &&
(n_words_remaining > 0) &&
while ((bytes_remaining > 0) &&
(r = fread (buf, 1, ODT_BUFFER_SIZE-1, fz))) {
gchar *normalized_chunk;
guint n_words_normalized;
/* Always make sure that the read string will be
* NIL-terminated */
buf[r] = '\0';
/* Get normalized chunk */
normalized_chunk = tracker_text_normalize (buf,
n_words_remaining,
&n_words_normalized);
/* Update number of words remaining.
* Note that n_words_normalized should always be less or
* equal than n_words_remaining */
n_words_remaining = (n_words_normalized <= n_words_remaining ?