vteunistr.c 6.06 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
/*
 * Copyright (C) 2008 Red Hat, Inc.
 *
 * This is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Author(s):
 * 	Behdad Esfahbod
 */

#include <config.h>

#include <string.h>

#include "vteunistr.h"

Behdad Esfahbod's avatar
Behdad Esfahbod committed
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
/* Overview:
 *
 * The way vteunistr is implemented is very simple: Unicode only defines
 * codepoints less than 0x10FFFF.  That leaves plenty of room in a guint32 to
 * use for other things.  So, whenever our "string" contains only one Unicode
 * character, we use its code as our vteunistr.  Otherwise, we register the
 * string in a central registry and use assign a unique number to it.  That
 * number can be thought as "our own private non-unicode code for this
 * sequence of characters".
 *
 * The rest of the problem would be how to efficiently implement this
 * registry.  It does *NOT* really have to be efficient at all, as it will
 * only be access in case of combining marks.  And the strings are pretty
 * short most of the time.  But our implementation is quite efficient and
 * nifty anyway.
 *
 * The access pattern of using vteunistr's is that we have a vteunistr in a
 * terminal cell, a new gunichar comes in and we decide to combine with it,
 * and we combine them and get a new vteunistr.  So, that is exactly how we
 * encode vteunistr's: all we need to know about a vteunistr to be able to
 * reconstruct its string is the vteunistr and the gunichar that joined to
 * form it.  That's what VteUnistrDecomp is.  That is the decomposition.
 *
 * We start giving new vteunistr's unique numbers starting at
 * VTE_UNISTR_START+1 and going up.  We keep the decompositions in a GArray,
 * called unistr_decomp.  The first entry of the array is unused (that's why
 * we start from VTE_UNISTR_START plus one).  The decomposition table provides
 * enough information to efficiently answer questions like "what's the first
 * gunichar in this vteunistr?", "what's the sequence of gunichar's in this
 * vteunistr?", and "how many gunichar's are there in this vteunistr?".
 *
 * We still do not have any efficient way to construct new vteunistr's though.
 * Given a vteunistr and a gunichar, we have to walk over the entire
 * decomposition table to see if we have already registered (encoded) this
 * combination.  To make that operation fast, we use a reverse map, that is,
 * a GHashTable named unistr_comp.  The hash table maps a decomposition to its
 * encoded vteunistr value.  The value obivously fits in a pointer and does
 * not need memory allocation.  We also want to avoid allocating memory for
 * the keys of the hash table entries, as we already have those decompositions
 * in the memory in the unistr_decomp array.  We cannot use direct pointers
 * though as when growing the GArray may resize and move to a new memory
 * buffer, rendering all our pointers invalid.  For this reason, we keep the
 * index into the array as our hash table key.  When we want to perform a
 * lookup in the hash table, we insert the decomposition that we are searching
 * for as item zero in the unistr_decomp table, then lookup for an equal entry
 * of it in the hash table.  Finally, if the hash lookup fails, we add the new
 * decomposition to the lookup array and the hash table and return the newly
 * encoded vteunistr value.
 */
77

Behdad Esfahbod's avatar
Behdad Esfahbod committed
78
#define VTE_UNISTR_START 0x80000000
79 80 81 82 83 84 85 86 87 88 89

static vteunistr unistr_next = VTE_UNISTR_START + 1;

struct VteUnistrDecomp {
	vteunistr prefix;
	gunichar  suffix;
};

GArray     *unistr_decomp;
GHashTable *unistr_comp;

Behdad Esfahbod's avatar
Behdad Esfahbod committed
90 91 92
#define DECOMP_FROM_INDEX(i)	g_array_index (unistr_decomp, struct VteUnistrDecomp, (i))
#define DECOMP_FROM_UNISTR(s)	DECOMP_FROM_INDEX ((s) - VTE_UNISTR_START)

93 94 95 96
static guint
unistr_comp_hash (gconstpointer key)
{
	struct VteUnistrDecomp *decomp;
Behdad Esfahbod's avatar
Behdad Esfahbod committed
97
	decomp = &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (key));
98 99 100 101
	return decomp->prefix ^ decomp->suffix;
}

static gboolean
Behdad Esfahbod's avatar
Behdad Esfahbod committed
102
unistr_comp_equal (gconstpointer a, gconstpointer b)
103
{
Behdad Esfahbod's avatar
Behdad Esfahbod committed
104 105
	return 0 == memcmp (&DECOMP_FROM_INDEX (GPOINTER_TO_UINT (a)),
			    &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (b)),
106 107 108 109 110 111 112 113 114 115 116 117 118
			    sizeof (struct VteUnistrDecomp));
}

vteunistr
_vte_unistr_append_unichar (vteunistr s, gunichar c)
{
	struct VteUnistrDecomp decomp;
	vteunistr ret = 0;

	decomp.prefix = s;
	decomp.suffix = c;

	if (G_UNLIKELY (!unistr_decomp)) {
Behdad Esfahbod's avatar
Behdad Esfahbod committed
119
		unistr_decomp = g_array_new (FALSE, TRUE, sizeof (struct VteUnistrDecomp));
120
		g_array_set_size (unistr_decomp, 1);
Behdad Esfahbod's avatar
Behdad Esfahbod committed
121
		unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp_equal);
122
	} else {
Behdad Esfahbod's avatar
Behdad Esfahbod committed
123 124
		DECOMP_FROM_INDEX (0) = decomp;
		ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp, GUINT_TO_POINTER (0)));
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
	}

	if (G_UNLIKELY (!ret)) {
		ret = unistr_next++;
		g_array_append_val (unistr_decomp, decomp);
		g_hash_table_insert (unistr_comp,
				     GUINT_TO_POINTER (ret - VTE_UNISTR_START),
				     GUINT_TO_POINTER (ret));
	}

	return ret;
}

gunichar
_vte_unistr_get_base (vteunistr s)
{
	g_return_val_if_fail (s < unistr_next, s);
	while (G_UNLIKELY (s >= VTE_UNISTR_START))
Behdad Esfahbod's avatar
Behdad Esfahbod committed
143
		s = DECOMP_FROM_UNISTR (s).prefix;
144 145 146 147 148 149 150 151 152
	return (gunichar) s;
}

void
_vte_unistr_append_to_string (vteunistr s, GString *gs)
{
	g_return_if_fail (s < unistr_next);
	if (G_UNLIKELY (s >= VTE_UNISTR_START)) {
		struct VteUnistrDecomp *decomp;
Behdad Esfahbod's avatar
Behdad Esfahbod committed
153
		decomp = &DECOMP_FROM_UNISTR (s);
154 155 156 157 158
		_vte_unistr_append_to_string (decomp->prefix, gs);
		s = decomp->suffix;
	}
	g_string_append_unichar (gs, (gunichar) s);
}
Behdad Esfahbod's avatar
Behdad Esfahbod committed
159 160 161 162 163 164 165 166 167 168 169 170 171 172

#if 0 /* unused */
int
_vte_unistr_strlen (vteunistr s)
{
	int len = 1;
	g_return_val_if_fail (s < unistr_next, len);
	while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
		s = DECOMP_FROM_UNISTR (s).prefix;
		len++;
	}
	return len;
}
#endif