Commit d453dce1 authored by Behdad Esfahbod's avatar Behdad Esfahbod Committed by Behdad Esfahbod

Document vteunistr.

2008-12-12  Behdad Esfahbod  <behdad@gnome.org>

        * doc/reference/Makefile.am:
        * src/vteunistr.c (unistr_comp_hash), (unistr_comp_equal),
        (_vte_unistr_append_unichar), (_vte_unistr_get_base),
        (_vte_unistr_append_to_string), (_vte_unistr_strlen):
        * src/vteunistr.h:
        Document vteunistr.


svn path=/trunk/; revision=2344
parent beb61da0
2008-12-12 Behdad Esfahbod <behdad@gnome.org>
* doc/reference/Makefile.am:
* src/vteunistr.c (unistr_comp_hash), (unistr_comp_equal),
(_vte_unistr_append_unichar), (_vte_unistr_get_base),
(_vte_unistr_append_to_string), (_vte_unistr_strlen):
* src/vteunistr.h:
Document vteunistr.
2008-12-12 Behdad Esfahbod <behdad@gnome.org> 2008-12-12 Behdad Esfahbod <behdad@gnome.org>
* src/vteunistr.h: Minor cleanup. * src/vteunistr.h: Minor cleanup.
......
...@@ -47,6 +47,7 @@ IGNORE_HFILES= \ ...@@ -47,6 +47,7 @@ IGNORE_HFILES= \
vteskel.h \ vteskel.h \
vtetc.h \ vtetc.h \
vtetree.h \ vtetree.h \
vteunistr.h \
vtexft.h vtexft.h
......
...@@ -25,8 +25,57 @@ ...@@ -25,8 +25,57 @@
#include "vteunistr.h" #include "vteunistr.h"
#define VTE_UNISTR_START 0x80000000 /* Overview:
*
* The way vteunistr is implemented is very simple: Unicode only defines
* codepoints less than 0x10FFFF. That leaves plenty of room in a guint32 to
* use for other things. So, whenever our "string" contains only one Unicode
* character, we use its code as our vteunistr. Otherwise, we register the
* string in a central registry and use assign a unique number to it. That
* number can be thought as "our own private non-unicode code for this
* sequence of characters".
*
* The rest of the problem would be how to efficiently implement this
* registry. It does *NOT* really have to be efficient at all, as it will
* only be access in case of combining marks. And the strings are pretty
* short most of the time. But our implementation is quite efficient and
* nifty anyway.
*
* The access pattern of using vteunistr's is that we have a vteunistr in a
* terminal cell, a new gunichar comes in and we decide to combine with it,
* and we combine them and get a new vteunistr. So, that is exactly how we
* encode vteunistr's: all we need to know about a vteunistr to be able to
* reconstruct its string is the vteunistr and the gunichar that joined to
* form it. That's what VteUnistrDecomp is. That is the decomposition.
*
* We start giving new vteunistr's unique numbers starting at
* VTE_UNISTR_START+1 and going up. We keep the decompositions in a GArray,
* called unistr_decomp. The first entry of the array is unused (that's why
* we start from VTE_UNISTR_START plus one). The decomposition table provides
* enough information to efficiently answer questions like "what's the first
* gunichar in this vteunistr?", "what's the sequence of gunichar's in this
* vteunistr?", and "how many gunichar's are there in this vteunistr?".
*
* We still do not have any efficient way to construct new vteunistr's though.
* Given a vteunistr and a gunichar, we have to walk over the entire
* decomposition table to see if we have already registered (encoded) this
* combination. To make that operation fast, we use a reverse map, that is,
* a GHashTable named unistr_comp. The hash table maps a decomposition to its
* encoded vteunistr value. The value obivously fits in a pointer and does
* not need memory allocation. We also want to avoid allocating memory for
* the keys of the hash table entries, as we already have those decompositions
* in the memory in the unistr_decomp array. We cannot use direct pointers
* though as when growing the GArray may resize and move to a new memory
* buffer, rendering all our pointers invalid. For this reason, we keep the
* index into the array as our hash table key. When we want to perform a
* lookup in the hash table, we insert the decomposition that we are searching
* for as item zero in the unistr_decomp table, then lookup for an equal entry
* of it in the hash table. Finally, if the hash lookup fails, we add the new
* decomposition to the lookup array and the hash table and return the newly
* encoded vteunistr value.
*/
#define VTE_UNISTR_START 0x80000000
static vteunistr unistr_next = VTE_UNISTR_START + 1; static vteunistr unistr_next = VTE_UNISTR_START + 1;
...@@ -38,26 +87,22 @@ struct VteUnistrDecomp { ...@@ -38,26 +87,22 @@ struct VteUnistrDecomp {
GArray *unistr_decomp; GArray *unistr_decomp;
GHashTable *unistr_comp; GHashTable *unistr_comp;
#define DECOMP_FROM_INDEX(i) g_array_index (unistr_decomp, struct VteUnistrDecomp, (i))
#define DECOMP_FROM_UNISTR(s) DECOMP_FROM_INDEX ((s) - VTE_UNISTR_START)
static guint static guint
unistr_comp_hash (gconstpointer key) unistr_comp_hash (gconstpointer key)
{ {
struct VteUnistrDecomp *decomp; struct VteUnistrDecomp *decomp;
decomp = &g_array_index (unistr_decomp, decomp = &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (key));
struct VteUnistrDecomp,
GPOINTER_TO_UINT (key));
return decomp->prefix ^ decomp->suffix; return decomp->prefix ^ decomp->suffix;
} }
static gboolean static gboolean
unistr_comp_equal (gconstpointer a, unistr_comp_equal (gconstpointer a, gconstpointer b)
gconstpointer b)
{ {
return 0 == memcmp (&g_array_index (unistr_decomp, return 0 == memcmp (&DECOMP_FROM_INDEX (GPOINTER_TO_UINT (a)),
struct VteUnistrDecomp, &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (b)),
GPOINTER_TO_UINT (a)),
&g_array_index (unistr_decomp,
struct VteUnistrDecomp,
GPOINTER_TO_UINT (b)),
sizeof (struct VteUnistrDecomp)); sizeof (struct VteUnistrDecomp));
} }
...@@ -71,17 +116,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c) ...@@ -71,17 +116,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c)
decomp.suffix = c; decomp.suffix = c;
if (G_UNLIKELY (!unistr_decomp)) { if (G_UNLIKELY (!unistr_decomp)) {
unistr_decomp = g_array_new (FALSE, TRUE, unistr_decomp = g_array_new (FALSE, TRUE, sizeof (struct VteUnistrDecomp));
sizeof (struct VteUnistrDecomp));
g_array_set_size (unistr_decomp, 1); g_array_set_size (unistr_decomp, 1);
unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp_equal);
unistr_comp_equal);
} else { } else {
g_array_index (unistr_decomp, DECOMP_FROM_INDEX (0) = decomp;
struct VteUnistrDecomp, ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp, GUINT_TO_POINTER (0)));
0) = decomp;
ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp,
GUINT_TO_POINTER (0)));
} }
if (G_UNLIKELY (!ret)) { if (G_UNLIKELY (!ret)) {
...@@ -95,30 +135,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c) ...@@ -95,30 +135,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c)
return ret; return ret;
} }
/* Unused
int
_vte_unistr_strlen (vteunistr s)
{
int len = 1;
g_return_val_if_fail (s < unistr_next, len);
while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
s = g_array_index (unistr_decomp,
struct VteUnistrDecomp,
s - VTE_UNISTR_START).prefix;
len++;
}
return len;
}
*/
gunichar gunichar
_vte_unistr_get_base (vteunistr s) _vte_unistr_get_base (vteunistr s)
{ {
g_return_val_if_fail (s < unistr_next, s); g_return_val_if_fail (s < unistr_next, s);
while (G_UNLIKELY (s >= VTE_UNISTR_START)) while (G_UNLIKELY (s >= VTE_UNISTR_START))
s = g_array_index (unistr_decomp, s = DECOMP_FROM_UNISTR (s).prefix;
struct VteUnistrDecomp,
s - VTE_UNISTR_START).prefix;
return (gunichar) s; return (gunichar) s;
} }
...@@ -128,11 +150,23 @@ _vte_unistr_append_to_string (vteunistr s, GString *gs) ...@@ -128,11 +150,23 @@ _vte_unistr_append_to_string (vteunistr s, GString *gs)
g_return_if_fail (s < unistr_next); g_return_if_fail (s < unistr_next);
if (G_UNLIKELY (s >= VTE_UNISTR_START)) { if (G_UNLIKELY (s >= VTE_UNISTR_START)) {
struct VteUnistrDecomp *decomp; struct VteUnistrDecomp *decomp;
decomp = &g_array_index (unistr_decomp, decomp = &DECOMP_FROM_UNISTR (s);
struct VteUnistrDecomp,
s - VTE_UNISTR_START);
_vte_unistr_append_to_string (decomp->prefix, gs); _vte_unistr_append_to_string (decomp->prefix, gs);
s = decomp->suffix; s = decomp->suffix;
} }
g_string_append_unichar (gs, (gunichar) s); g_string_append_unichar (gs, (gunichar) s);
} }
#if 0 /* unused */
int
_vte_unistr_strlen (vteunistr s)
{
int len = 1;
g_return_val_if_fail (s < unistr_next, len);
while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
s = DECOMP_FROM_UNISTR (s).prefix;
len++;
}
return len;
}
#endif
...@@ -26,22 +26,64 @@ ...@@ -26,22 +26,64 @@
G_BEGIN_DECLS G_BEGIN_DECLS
/**
* vteunistr:
*
* vteunistr is a gunichar-compatible way to store strings. A string
* consisting of a single unichar c is represented as the same value
* as c itself. In that sense, gunichars can be readily used as
* vteunistrs. Longer strings can be built by appending a unichar
* to an already existing string.
*
* vteunistr is essentially just a gunicode-compatible quark value.
* It can be used to store strings (of a base followed by combining
* characters) where the code was designed to only allow one character.
*
* Strings are internalized efficiently and never freed. No memory
* management of vteunistr values is needed.
**/
typedef guint32 vteunistr; typedef guint32 vteunistr;
/**
* _vte_unistr_append_unichar:
* @s: a #vteunistr
* @c: Unicode character to append to @s
*
* Creates a vteunistr value for the string @s followed by the
* character @c.
*
* Returns: the new #vteunistr value
**/
vteunistr vteunistr
_vte_unistr_append_unichar (vteunistr s, gunichar c); _vte_unistr_append_unichar (vteunistr s, gunichar c);
/* Unused
int
_vte_unistr_strlen (vteunistr s);
*/
gunichar gunichar
_vte_unistr_get_base (vteunistr s); _vte_unistr_get_base (vteunistr s);
/**
* _vte_unistr_append_to_string:
* @s: a #vteunistr
* @gs: a #GString to append @s to
*
* Appends @s to @gs. This is how one converts a #vteunistr to a
* traditional string.
**/
void void
_vte_unistr_append_to_string (vteunistr s, GString *gs); _vte_unistr_append_to_string (vteunistr s, GString *gs);
#if 0 /* unused */
/**
* _vte_unistr_strlen:
* @s: a #vteunistr
*
* Counts the number of character in @s.
*
* Returns: length of @s in characters.
**/
int
_vte_unistr_strlen (vteunistr s);
#endif
G_END_DECLS G_END_DECLS
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment