Commit 1c67e3e8 authored by Morten Welinder

Applix: fix encoding and character escape problems.

Escaping and continuation lines didn't play well together.

Decoding was absolutely bogus even for the 8-bit days for which it was
written.  Assume ISO-8859-1.

There is also some kind of longer encoding, which we still don't handle.

http://www.vistasource.com/doc/applixware/wordstechref.pdf
parent 39ee25d4
......@@ -8,6 +8,7 @@ Morten:
* New NT_RADICAL function.
* Fix conditional style crash.
* Fix applix locale problem. [#362]
* Fix applix encoding and escape problems. [#363]
--------------------------------------------------------------------------
Gnumeric 1.12.43
......
2018-11-03 Morten Welinder <terra@gnome.org>
* applix-read.c (applix_get_line): Properly handle continuation
lines with escaped characters. Handle non-ASCII characters on the
assumption that they are ISO-8859-1. Fixes #363.
2018-11-01 Morten Welinder <terra@gnome.org>
* applix-read.c (applix_conventions_new): Set up separators so we
......
......@@ -78,6 +78,7 @@ typedef struct {
GSList *std_names, *real_names;
GnmConventions *convs;
GIConv converter;
} ApplixReadState;
/* #define NO_DEBUG_APPLIX */
......@@ -296,57 +297,81 @@ static unsigned char *
applix_get_line (ApplixReadState *state)
{
unsigned char *ptr, *end, *buf;
size_t len, skip = 0, offset = 0;
GString *line = g_string_new (NULL);
gboolean first = TRUE;
// Read line and continuation lines.
while (NULL != (ptr = gsf_input_textline_ascii_gets (state->input))) {
len = strlen (ptr);
size_t len = strlen (ptr);
// Clip at the state line length
size_t uselen = MIN (len, state->line_len);
if (first) {
first = FALSE;
g_string_append_len (line, ptr, uselen);
} else if (uselen > 0) {
// Drop initial space from continuation line
g_string_append_len (line, ptr + 1, uselen - 1);
}
/* Clip at the state line length */
if (len > state->line_len)
len = state->line_len;
if (len < state->line_len)
break;
}
if ((offset + len) > state->buffer_size) {
state->buffer_size += state->line_len;
state->buffer = g_realloc (state->buffer, state->buffer_size + 1);
}
if (line->len > state->buffer_size) {
state->buffer_size = line->len;
state->buffer = g_realloc (state->buffer, state->buffer_size + 1);
}
end = ptr + len;
ptr += skip;
buf = state->buffer + offset;
while (ptr < end) {
if (*ptr == '^') {
if (ptr [1] != '^') {
if (ptr [1] == '\0' || ptr [2] == '\0') {
applix_parse_error (state, _("Missing characters for character encoding"));
*(buf++) = *(ptr++);
} else if (ptr [1] < 'a' || ptr [1] > 'p' ||
ptr [2] < 'a' || ptr [2] > 'p') {
applix_parse_error (state, _("Invalid characters for encoding '%c%c'"),
ptr[1], ptr[2]);
*(buf++) = *(ptr++);
} else {
*(buf++) = ((ptr[1] - 'a') << 8) | (ptr[2] - 'a');
ptr += 3;
}
} else /* an encoded carat */
*(buf++) = '^', ptr += 2;
} else
*(buf++) = *(ptr++);
ptr = line->str;
end = ptr + line->len;
buf = state->buffer;
// g_printerr ("Pre [%s]\n", ptr);
while (ptr < end) {
if (*ptr != '^') {
*(buf++) = *(ptr++);
continue;
}
offset = buf - state->buffer;
if (ptr[1] == '^') {
// An encoded carat
*(buf++) = '^', ptr += 2;
continue;
}
if (len >= state->line_len)
skip = 1; /* skip the leading space for next line */
else
break;
if (ptr[1] == '\0' || ptr[2] == '\0') {
applix_parse_error (state, _("Missing characters for character encoding"));
*(buf++) = *(ptr++);
} else if (ptr[1] < 'a' || ptr[1] > 'p' ||
ptr[2] < 'a' || ptr[2] > 'p') {
applix_parse_error (state, _("Invalid characters for encoding '%c%c'"),
ptr[1], ptr[2]);
*(buf++) = *(ptr++);
} else {
guchar uc = ((ptr[1] - 'a') << 4) | (ptr[2] - 'a');
gsize utf8_len;
char *utf8buf = g_convert_with_iconv (&uc, 1, state->converter, NULL,
&utf8_len, NULL);
memcpy (buf, utf8buf, utf8_len);
buf += utf8_len;
g_free (utf8buf);
ptr += 3;
}
}
if (offset == 0 && ptr == NULL)
if (line->len == 0) {
g_string_free (line, TRUE);
return NULL;
}
if (buf)
*buf = 0;
g_string_free (line, TRUE);
if (state->buffer != NULL)
state->buffer [offset] = '\0';
//g_printerr ("Post: [%s]\n", state->buffer);
return state->buffer;
}
......@@ -885,7 +910,7 @@ applix_read_attributes (ApplixReadState *state)
if (!a_strncmp (ptr, "Attr Table End"))
return FALSE;
if (ptr [0] != '<')
if (ptr[0] != '<')
return applix_parse_error (state, "Invalid attribute");
/* TODO : The first style seems to be a different format */
......@@ -1659,6 +1684,7 @@ applix_read (GOIOContext *io_context, WorkbookView *wb_view, GsfInput *src)
state.std_names = NULL;
state.real_names = NULL;
state.convs = applix_conventions_new ();
state.converter = g_iconv_open ("UTF-8", "ISO-8859-1");
/* Actually read the workbook */
res = applix_read_impl (&state);
......@@ -1708,4 +1734,5 @@ applix_read (GOIOContext *io_context, WorkbookView *wb_view, GsfInput *src)
go_io_error_info_set (io_context, state.parse_error);
gnm_conventions_unref (state.convs);
gsf_iconv_close (state.converter);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment