Commit d995b503 authored by Morten Welinder's avatar Morten Welinder Committed by Morten Welinder

New function.

2003-06-16  Morten Welinder  <terra@gnome.org>

	* src/gutils.c (gnm_guess_encoding): New function.

	* src/stf.c (stf_read_workbook_auto_csvtab): Properly decode file
	before inspecting.  Use stf_parse_options_guess.

	* src/stf-parse.c (stf_parse_options_guess): New function.
parent aa6052ca
......@@ -187,7 +187,7 @@ Pending Patches
2.8) Make listed separators locale dependent
2.9) User specifiable locale encoding (not to be confused
with character encoding)
2.10) Fix stf bugs
2.10) Fix stf bugs (DONE)
2.11) Improve initial file preview (DONE)
2.12) Hook up workbook-control-gui.c:cb_data_import_text
Data -> External -> Import _Text File...
......
2003-06-16 Morten Welinder <terra@gnome.org>
* src/gutils.c (gnm_guess_encoding): New function.
* src/stf.c (stf_read_workbook_auto_csvtab): Properly decode file
before inspecting. Use stf_parse_options_guess.
* src/stf-parse.c (stf_parse_options_guess): New function.
* src/parse-util.c (sheetref_parse): Base on gnm_strunescape like
parser.y's equivalent code. Avoid unbounded stack usage.
......
2003-06-16 Morten Welinder <terra@gnome.org>
* src/gutils.c (gnm_guess_encoding): New function.
* src/stf.c (stf_read_workbook_auto_csvtab): Properly decode file
before inspecting. Use stf_parse_options_guess.
* src/stf-parse.c (stf_parse_options_guess): New function.
* src/parse-util.c (sheetref_parse): Base on gnm_strunescape like
parser.y's equivalent code. Avoid unbounded stack usage.
......
2003-06-16 Morten Welinder <terra@gnome.org>
* src/gutils.c (gnm_guess_encoding): New function.
* src/stf.c (stf_read_workbook_auto_csvtab): Properly decode file
before inspecting. Use stf_parse_options_guess.
* src/stf-parse.c (stf_parse_options_guess): New function.
* src/parse-util.c (sheetref_parse): Base on gnm_strunescape like
parser.y's equivalent code. Avoid unbounded stack usage.
......
......@@ -870,3 +870,43 @@ g_str_compare (void const *x, void const *y)
return strcmp (x, y);
}
const char *
gnm_guess_encoding (const char *raw, size_t len, const char *user_guess,
char **utf8_str)
{
int try;
g_return_val_if_fail (raw != NULL, NULL);
for (try = 1; 1; try++) {
const char *guess;
GError *error = NULL;
char *utf8_data;
switch (try) {
case 1: guess = user_guess; break;
case 2: g_get_charset (&guess); break;
case 3: guess = "ASCII"; break;
case 4: guess = "ISO-8859-1"; break;
case 5: guess = "UTF-8"; break;
default: return NULL;
}
if (!guess)
continue;
utf8_data = g_convert (raw, len, "UTF-8", guess,
NULL, NULL, &error);
if (!error) {
if (utf8_str)
*utf8_str = utf8_data;
else
g_free (utf8_data);
return guess;
}
g_error_free (error);
}
}
......@@ -68,6 +68,10 @@ guint gnumeric_ascii_strcase_hash (gconstpointer v);
gint gnumeric_ascii_strcase_equal (gconstpointer v, gconstpointer v2);
gint gnumeric_utf8_collate_casefold (const char *a, const char *b);
const char *gnm_guess_encoding (const char *raw, size_t len,
const char *user_guess,
char **utf8_str);
/**
* System and user paths
*/
......
......@@ -39,11 +39,8 @@
#include "number-match.h"
#include "gutils.h"
#include "parse-util.h"
#include "format.h"
#include <ctype.h>
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif
#include <stdlib.h>
#define WARN_TOO_MANY_ROWS _("Too many rows in data to parse: %d")
......@@ -304,7 +301,7 @@ stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions, char cons
parseoptions->sep.chr = g_strdup (character);
g_slist_free_custom (parseoptions->sep.str, g_free);
parseoptions->sep.str = g_slist_map (string, g_strdup);
parseoptions->sep.str = g_slist_map (string, (GnmMapFunc)g_strdup);
}
void
......@@ -334,7 +331,7 @@ stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoption
/**
* stf_parse_options_csv_set_duplicates:
* @duplicates : a boolean value indicating whether we want to see two
* separators right behind eachother as one
* separators right behind each other as one
**/
void
stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
......@@ -1264,3 +1261,80 @@ stf_parse_next_token (char const *data, StfParseOptions_t *parseoptions, StfToke
*tokentype = ttype;
return character;
}
static int
int_sort (const void *a, const void *b)
{
return *(const int *)a - *(const int *)b;
}
static int
count_character (GPtrArray *lines, gunichar c, double quantile)
{
int *counts, res;
unsigned int ui;
if (lines->len == 0)
return 0;
counts = g_new (int, lines->len);
for (ui = 0; ui < lines->len; ui++) {
int count = 0;
GPtrArray *boxline = g_ptr_array_index (lines, ui);
const char *line = g_ptr_array_index (boxline, 0);
while (*line) {
if (g_utf8_get_char (line) == c)
count++;
line = g_utf8_next_char (line);
}
counts[ui] = count;
}
qsort (counts, lines->len, sizeof (counts[0]), int_sort);
ui = (unsigned int)ceil (quantile * lines->len);
if (ui == lines->len)
ui--;
res = counts[ui];
g_free (counts);
return res;
}
StfParseOptions_t *
stf_parse_options_guess (const char *data)
{
StfParseOptions_t *res;
GPtrArray *lines;
int tabcount;
int sepcount;
gunichar sepchar = format_get_arg_sep ();
g_return_val_if_fail (data != NULL, NULL);
res = stf_parse_options_new ();
lines = stf_parse_lines (res, data, FALSE);
tabcount = count_character (lines, '\t', 0.2);
sepcount = count_character (lines, sepchar, 0.2);
/* At least one tab per line and enough to separate every
would-be sepchars. */
if (tabcount >= 1 && tabcount >= sepcount - 1)
stf_parse_options_csv_set_separators (res, "\t", NULL);
stf_parse_options_set_type (res, PARSE_TYPE_CSV);
stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
stf_parse_options_csv_set_duplicates (res, FALSE);
stf_parse_options_csv_set_stringindicator (res, '"');
stf_parse_general_free (lines);
return res;
}
......@@ -59,6 +59,8 @@ typedef struct {
StfParseOptions_t *stf_parse_options_new (void);
void stf_parse_options_free (StfParseOptions_t *parseoptions);
StfParseOptions_t *stf_parse_options_guess (const char *data);
/* MANIPULATION of stf options struct */
void stf_parse_options_set_type (StfParseOptions_t *parseoptions,
......
......@@ -319,76 +319,45 @@ stf_read_workbook_auto_csvtab (GnmFileOpener const *fo, gchar const *enc,
Sheet *sheet;
Workbook *book;
char *name;
char *data;
char *data, *utf8data;
size_t data_len;
StfParseOptions_t *po;
char const *pos;
unsigned int sep = 0, tab = 0, lines = 0;
int i;
gboolean last_was_newline = FALSE;
gunichar guni_tab = '\t';
gunichar guni_newline = '\n';
gunichar guni_carriage = '\r';
gunichar guni_sep = format_get_arg_sep ();
g_return_if_fail (context != NULL);
g_return_if_fail (wbv != NULL);
book = wb_view_workbook (wbv);
data = stf_preparse (COMMAND_CONTEXT (context), input, &data_len);
if (!data)
return;
po = stf_parse_options_new ();
stf_parse_options_set_type (po, PARSE_TYPE_CSV);
stf_parse_options_set_trim_spaces (po, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
stf_parse_options_csv_set_stringindicator (po, '"');
stf_parse_options_csv_set_indicator_2x_is_single (po, TRUE);
stf_parse_options_csv_set_duplicates (po, FALSE);
for (i = STF_PROBE_SIZE, pos = data ; pos && *pos && i-- > 0;
pos = stf_parse_next_token (pos, po, NULL)) {
gunichar this_char;
this_char = g_utf8_get_char (pos);
if (this_char == guni_sep) {
++sep;
last_was_newline = FALSE;
} else if (this_char == guni_tab) {
++tab;
last_was_newline = FALSE;
} else if ((this_char == guni_newline || this_char == guni_carriage)
&& !last_was_newline) {
++lines;
last_was_newline = TRUE;
}
enc = gnm_guess_encoding (data, data_len, enc, &utf8data);
g_free (data);
if (!enc) {
gnumeric_error_read (COMMAND_CONTEXT (context),
_("That file is not in the given encoding."));
return;
}
po = stf_parse_options_guess (utf8data);
name = g_path_get_basename (gsf_input_name (input));
sheet = sheet_new (book, name);
g_free (name);
workbook_sheet_attach (book, sheet, NULL);
/* Guess */
stf_parse_options_csv_set_separators (po, (guni_sep == ',') ? ",":";", NULL);
if (tab > lines || tab >= sep)
stf_parse_options_csv_set_separators (po, "\t", NULL);
if (!stf_parse_sheet (po, data, NULL, sheet, 0, 0)) {
if (stf_parse_sheet (po, utf8data, NULL, sheet, 0, 0)) {
workbook_recalc (book);
sheet_queue_respan (sheet, 0, SHEET_MAX_ROWS-1);
} else {
workbook_sheet_detach (book, sheet);
g_free (data);
stf_parse_options_free (po);
gnumeric_error_read (COMMAND_CONTEXT (context),
_("Parse error while trying to parse data into sheet"));
return;
}
stf_parse_options_free (po);
workbook_recalc (book);
sheet_queue_respan (sheet, 0, SHEET_MAX_ROWS-1);
g_free (name);
g_free (data);
stf_parse_options_free (po);
g_free (utf8data);
}
/***********************************************************************************/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment