vteunistr.cc 6.68 KB
Newer Older
1 2 3
/*
 * Copyright (C) 2008 Red Hat, Inc.
 *
4 5 6 7
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
8
 *
9 10
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14 15 16
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
17 18 19 20 21 22 23
 *
 * Author(s):
 * 	Behdad Esfahbod
 */

#include <config.h>

24 25
#include "vteunistr.h"

26 27 28
#include <string.h>


Behdad Esfahbod's avatar
Behdad Esfahbod committed
29 30 31
/* Overview:
 *
 * The way vteunistr is implemented is very simple: Unicode only defines
Behdad Esfahbod's avatar
Behdad Esfahbod committed
32
 * codepoints less than 0x110000.  That leaves plenty of room in a guint32 to
Behdad Esfahbod's avatar
Behdad Esfahbod committed
33 34
 * use for other things.  So, whenever our "string" contains only one Unicode
 * character, we use its code as our vteunistr.  Otherwise, we register the
Behdad Esfahbod's avatar
Behdad Esfahbod committed
35 36
 * string in a central registry and assign a unique number to it and use that.
 * This number is "our own private internal non-unicode code for this
Behdad Esfahbod's avatar
Behdad Esfahbod committed
37 38 39 40
 * sequence of characters".
 *
 * The rest of the problem would be how to efficiently implement this
 * registry.  It does *NOT* really have to be efficient at all, as it will
Behdad Esfahbod's avatar
Behdad Esfahbod committed
41 42 43
 * only be accessed in case of combining marks.  And the strings are pretty
 * short (two or three characters).  But our implementation is quite efficient
 * anyway.
Behdad Esfahbod's avatar
Behdad Esfahbod committed
44 45 46 47 48 49 50 51 52
 *
 * The access pattern of using vteunistr's is that we have a vteunistr in a
 * terminal cell, a new gunichar comes in and we decide to combine with it,
 * and we combine them and get a new vteunistr.  So, that is exactly how we
 * encode vteunistr's: all we need to know about a vteunistr to be able to
 * reconstruct its string is the vteunistr and the gunichar that joined to
 * form it.  That's what VteUnistrDecomp is.  That is the decomposition.
 *
 * We start giving new vteunistr's unique numbers starting at
Behdad Esfahbod's avatar
Behdad Esfahbod committed
53
 * %VTE_UNISTR_START+1 and going up.  We keep the decompositions in a GArray,
Behdad Esfahbod's avatar
Behdad Esfahbod committed
54
 * called unistr_decomp.  The first entry of the array is unused (that's why
Behdad Esfahbod's avatar
Behdad Esfahbod committed
55
 * we start from %VTE_UNISTR_START plus one).  The decomposition table provides
Behdad Esfahbod's avatar
Behdad Esfahbod committed
56 57 58 59 60 61 62 63 64
 * enough information to efficiently answer questions like "what's the first
 * gunichar in this vteunistr?", "what's the sequence of gunichar's in this
 * vteunistr?", and "how many gunichar's are there in this vteunistr?".
 *
 * We still do not have any efficient way to construct new vteunistr's though.
 * Given a vteunistr and a gunichar, we have to walk over the entire
 * decomposition table to see if we have already registered (encoded) this
 * combination.  To make that operation fast, we use a reverse map, that is,
 * a GHashTable named unistr_comp.  The hash table maps a decomposition to its
Behdad Esfahbod's avatar
Behdad Esfahbod committed
65
 * encoded vteunistr value.  The value obviously fits in a pointer and does
Behdad Esfahbod's avatar
Behdad Esfahbod committed
66
 * not need memory allocation.  We also want to avoid allocating memory for
Behdad Esfahbod's avatar
Behdad Esfahbod committed
67
 * the key of the hash table entries, as we already have those decompositions
Behdad Esfahbod's avatar
Behdad Esfahbod committed
68
 * in the memory in the unistr_decomp array.  We cannot use direct pointers
Behdad Esfahbod's avatar
Behdad Esfahbod committed
69
 * though as when growing, the GArray may resize and move to a new memory
Behdad Esfahbod's avatar
Behdad Esfahbod committed
70 71 72 73 74
 * buffer, rendering all our pointers invalid.  For this reason, we keep the
 * index into the array as our hash table key.  When we want to perform a
 * lookup in the hash table, we insert the decomposition that we are searching
 * for as item zero in the unistr_decomp table, then lookup for an equal entry
 * of it in the hash table.  Finally, if the hash lookup fails, we add the new
Behdad Esfahbod's avatar
Behdad Esfahbod committed
75
 * decomposition to the lookup array and the hash table, and return the newly
Behdad Esfahbod's avatar
Behdad Esfahbod committed
76 77
 * encoded vteunistr value.
 */
78

Behdad Esfahbod's avatar
Behdad Esfahbod committed
79
#define VTE_UNISTR_START 0x80000000
80 81 82 83 84 85 86 87 88 89 90

static vteunistr unistr_next = VTE_UNISTR_START + 1;

struct VteUnistrDecomp {
	vteunistr prefix;
	gunichar  suffix;
};

GArray     *unistr_decomp;
GHashTable *unistr_comp;

Behdad Esfahbod's avatar
Behdad Esfahbod committed
91 92 93
#define DECOMP_FROM_INDEX(i)	g_array_index (unistr_decomp, struct VteUnistrDecomp, (i))
#define DECOMP_FROM_UNISTR(s)	DECOMP_FROM_INDEX ((s) - VTE_UNISTR_START)

94 95 96 97
static guint
unistr_comp_hash (gconstpointer key)
{
	struct VteUnistrDecomp *decomp;
Behdad Esfahbod's avatar
Behdad Esfahbod committed
98
	decomp = &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (key));
99 100 101 102
	return decomp->prefix ^ decomp->suffix;
}

static gboolean
Behdad Esfahbod's avatar
Behdad Esfahbod committed
103
unistr_comp_equal (gconstpointer a, gconstpointer b)
104
{
Behdad Esfahbod's avatar
Behdad Esfahbod committed
105 106
	return 0 == memcmp (&DECOMP_FROM_INDEX (GPOINTER_TO_UINT (a)),
			    &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (b)),
107 108 109 110 111 112 113 114 115 116 117 118 119
			    sizeof (struct VteUnistrDecomp));
}

vteunistr
_vte_unistr_append_unichar (vteunistr s, gunichar c)
{
	struct VteUnistrDecomp decomp;
	vteunistr ret = 0;

	decomp.prefix = s;
	decomp.suffix = c;

	if (G_UNLIKELY (!unistr_decomp)) {
Behdad Esfahbod's avatar
Behdad Esfahbod committed
120
		unistr_decomp = g_array_new (FALSE, TRUE, sizeof (struct VteUnistrDecomp));
121
		g_array_set_size (unistr_decomp, 1);
Behdad Esfahbod's avatar
Behdad Esfahbod committed
122
		unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp_equal);
123
	} else {
Behdad Esfahbod's avatar
Behdad Esfahbod committed
124 125
		DECOMP_FROM_INDEX (0) = decomp;
		ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp, GUINT_TO_POINTER (0)));
126 127 128
	}

	if (G_UNLIKELY (!ret)) {
129 130 131 132
		/* sanity check to avoid OOM */
		if (G_UNLIKELY (_vte_unistr_strlen (s) > 10 || unistr_next - VTE_UNISTR_START > 100000))
			return s;

133 134 135 136 137 138 139 140 141 142
		ret = unistr_next++;
		g_array_append_val (unistr_decomp, decomp);
		g_hash_table_insert (unistr_comp,
				     GUINT_TO_POINTER (ret - VTE_UNISTR_START),
				     GUINT_TO_POINTER (ret));
	}

	return ret;
}

143 144 145 146 147 148 149 150 151 152 153 154 155
vteunistr
_vte_unistr_append_unistr (vteunistr s, vteunistr t)
{
        g_return_val_if_fail (s < unistr_next, s);
        g_return_val_if_fail (t < unistr_next, s);
        if (G_UNLIKELY (t >= VTE_UNISTR_START)) {
                s = _vte_unistr_append_unistr (s, DECOMP_FROM_UNISTR (t).prefix);
                return _vte_unistr_append_unichar (s, DECOMP_FROM_UNISTR (t).suffix);
        } else {
                return _vte_unistr_append_unichar (s, t);
        }
}

156 157 158 159 160
gunichar
_vte_unistr_get_base (vteunistr s)
{
	g_return_val_if_fail (s < unistr_next, s);
	while (G_UNLIKELY (s >= VTE_UNISTR_START))
Behdad Esfahbod's avatar
Behdad Esfahbod committed
161
		s = DECOMP_FROM_UNISTR (s).prefix;
162 163 164 165 166 167 168 169 170
	return (gunichar) s;
}

void
_vte_unistr_append_to_string (vteunistr s, GString *gs)
{
	g_return_if_fail (s < unistr_next);
	if (G_UNLIKELY (s >= VTE_UNISTR_START)) {
		struct VteUnistrDecomp *decomp;
Behdad Esfahbod's avatar
Behdad Esfahbod committed
171
		decomp = &DECOMP_FROM_UNISTR (s);
172 173 174 175 176
		_vte_unistr_append_to_string (decomp->prefix, gs);
		s = decomp->suffix;
	}
	g_string_append_unichar (gs, (gunichar) s);
}
Behdad Esfahbod's avatar
Behdad Esfahbod committed
177

178
int
Behdad Esfahbod's avatar
Behdad Esfahbod committed
179 180 181 182 183 184 185 186 187 188
_vte_unistr_strlen (vteunistr s)
{
	int len = 1;
	g_return_val_if_fail (s < unistr_next, len);
	while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
		s = DECOMP_FROM_UNISTR (s).prefix;
		len++;
	}
	return len;
}