Commit 68e78574 authored by Matthias Clasen's avatar Matthias Clasen

Add Unicode script support

parent 9c19905b
2006-10-08 Matthias Clasen <mclasen@redhat.com>
Add a way to obtain Unicode script information. (#348348,
Marco Barisione)
* glib/glib.symbols:
* glib/gunicode.h: Add GUnicodeScript enumeration and
g_unichar_get_script.
* glib/guniprop.c: Implement g_unichar_get_script.
* glib/gscripttable.h: Generated private header containing
script tables.
* glib/gen-script-table.pl: Script to generate gscripttable.h.
* glib/Makefile.am: Update
2006-10-08 Matthias Clasen <mclasen@redhat.com>
* tests/run-markup-tests.sh: Small portability fix. (#347944,
......
2006-10-08 Matthias Clasen <mclasen@redhat.com>
* glib/glib-sections.txt: Add g_unichar_get_script() and
GUnicodeScript.
* glib/tmpl/unicode.sgml: Document GUnicodeScript
* gobject/tmpl/enumerations_flags.sgml: Add a hint about
the requirement that enum and flags values must be static.
......
......@@ -2275,6 +2275,8 @@ g_unichar_break_type
g_unicode_canonical_ordering
g_unicode_canonical_decomposition
g_unichar_get_mirror_char
GUnicodeScript
g_unichar_get_script
<SUBSECTION>
g_utf8_next_char
......
......@@ -302,7 +302,6 @@ Applications should be ready to handle unknown values.
They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
>http://www.unicode.org/unicode/reports/tr14/</ulink>.
</para>
@G_UNICODE_BREAK_MANDATORY:
......@@ -380,6 +379,99 @@ See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
@Returns:
<!-- ##### ENUM GUnicodeScript ##### -->
<para>
The #GUnicodeScript enumeration identifies different writing
systems. The values correspond to the names as defined in the
Unicode standard. The enumeration has been added in GLib 2.14.
Note that new values may be added in the future. Applications
should be ready to handle unknown values.
See <ulink
url="http://www.unicode.org/reports/tr24/">Unicode Standard Annex
#24: Script names</ulink>.
</para>
@G_UNICODE_SCRIPT_INVALID_CODE: a value never returned from g_unichar_get_script()
@G_UNICODE_SCRIPT_COMMON: a character used by multiple different scripts
@G_UNICODE_SCRIPT_INHERITED: a mark glyph that takes its script from the
base glyph to which it is attached
@G_UNICODE_SCRIPT_ARABIC: Arabic
@G_UNICODE_SCRIPT_ARMENIAN: Armenian
@G_UNICODE_SCRIPT_BENGALI: Bengali
@G_UNICODE_SCRIPT_BOPOMOFO: Bopomofo
@G_UNICODE_SCRIPT_CHEROKEE: Cherokee
@G_UNICODE_SCRIPT_COPTIC: Coptic
@G_UNICODE_SCRIPT_CYRILLIC: Cyrillic
@G_UNICODE_SCRIPT_DESERET: Deseret
@G_UNICODE_SCRIPT_DEVANAGARI: Devanagari
@G_UNICODE_SCRIPT_ETHIOPIC: Ethiopic
@G_UNICODE_SCRIPT_GEORGIAN: Georgian
@G_UNICODE_SCRIPT_GOTHIC: Gothic
@G_UNICODE_SCRIPT_GREEK: Greek
@G_UNICODE_SCRIPT_GUJARATI: Gujarati
@G_UNICODE_SCRIPT_GURMUKHI: Gurmukhi
@G_UNICODE_SCRIPT_HAN: Han
@G_UNICODE_SCRIPT_HANGUL: Hangul
@G_UNICODE_SCRIPT_HEBREW: Hebrew
@G_UNICODE_SCRIPT_HIRAGANA: Hiragana
@G_UNICODE_SCRIPT_KANNADA: Kannada
@G_UNICODE_SCRIPT_KATAKANA: Katakana
@G_UNICODE_SCRIPT_KHMER: Khmer
@G_UNICODE_SCRIPT_LAO: Lao
@G_UNICODE_SCRIPT_LATIN: Latin
@G_UNICODE_SCRIPT_MALAYALAM: Malayalam
@G_UNICODE_SCRIPT_MONGOLIAN: Mongolian
@G_UNICODE_SCRIPT_MYANMAR: Myanmar
@G_UNICODE_SCRIPT_OGHAM: Ogham
@G_UNICODE_SCRIPT_OLD_ITALIC: Old Italic
@G_UNICODE_SCRIPT_ORIYA: Oriya
@G_UNICODE_SCRIPT_RUNIC: Runic
@G_UNICODE_SCRIPT_SINHALA: Sinhala
@G_UNICODE_SCRIPT_SYRIAC: Syriac
@G_UNICODE_SCRIPT_TAMIL: Tamil
@G_UNICODE_SCRIPT_TELUGU: Telugu
@G_UNICODE_SCRIPT_THAANA: Thaana
@G_UNICODE_SCRIPT_THAI: Thai
@G_UNICODE_SCRIPT_TIBETAN: Tibetan
@G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL:
Canadian Aboriginal
@G_UNICODE_SCRIPT_YI: Yi
@G_UNICODE_SCRIPT_TAGALOG: Tagalog
@G_UNICODE_SCRIPT_HANUNOO: Hanunoo
@G_UNICODE_SCRIPT_BUHID: Buhid
@G_UNICODE_SCRIPT_TAGBANWA: Tagbanwa
@G_UNICODE_SCRIPT_BRAILLE: Braille
@G_UNICODE_SCRIPT_CYPRIOT: Cypriot
@G_UNICODE_SCRIPT_LIMBU: Limbu
@G_UNICODE_SCRIPT_OSMANYA: Osmanya
@G_UNICODE_SCRIPT_SHAVIAN: Shavian
@G_UNICODE_SCRIPT_LINEAR_B: Linear B
@G_UNICODE_SCRIPT_TAI_LE: Tai Le
@G_UNICODE_SCRIPT_UGARITIC: Ugaritic
@G_UNICODE_SCRIPT_NEW_TAI_LUE: New Tai Lue
@G_UNICODE_SCRIPT_BUGINESE: Buginese
@G_UNICODE_SCRIPT_GLAGOLITIC: Glagolitic
@G_UNICODE_SCRIPT_TIFINAGH: Tifinagh
@G_UNICODE_SCRIPT_SYLOTI_NAGRI: Syloti Nagri
@G_UNICODE_SCRIPT_OLD_PERSIAN: Old Persian
@G_UNICODE_SCRIPT_KHAROSHTHI: Kharoshthi
@G_UNICODE_SCRIPT_UNKNOWN: an unassigned code point
@G_UNICODE_SCRIPT_BALINESE: Balinese
@G_UNICODE_SCRIPT_CUNEIFORM: Cuneiform
@G_UNICODE_SCRIPT_PHOENICIAN: Phoenician
@G_UNICODE_SCRIPT_PHAGS_PA: Phags-pa
@G_UNICODE_SCRIPT_NKO: N'Ko
<!-- ##### FUNCTION g_unichar_get_script ##### -->
<para>
</para>
@ch:
@Returns:
<!-- ##### MACRO g_utf8_next_char ##### -->
<para>
Skips to the next character in a UTF-8 string. The string must be
......
#!/usr/bin/perl -w
#
# Script to convert http://www.unicode.org/Public/UNIDATA/Scripts.txt
# into a machine-readable table.
#
# Usage: gen-script-table.pl Scripts.txt > gscripttable.h
#
# The generated header contains two tables:
#   - g_script_easy_table: one entry per code point below
#     G_EASY_SCRIPTS_RANGE, for direct indexing;
#   - g_script_table: (start, length, script) ranges for every code point
#     at or above G_EASY_SCRIPTS_RANGE, sorted by start.
#
# NOTE(review): the emitted identifiers use the prefix G_SCRIPT_, while the
# public enum in glib/gunicode.h uses G_UNICODE_SCRIPT_.  The prefix is left
# untouched here; confirm against the consumer of gscripttable.h.
#
######################################################################

use strict;
use warnings;

if (@ARGV != 1) {
    die "Usage: gen-script-table.pl Scripts.txt > gscripttable.h\n";
}

# Three-argument open with low-precedence "or": the original
# "open IN, \$ARGV[0] || die" bound "||" to the file name, so a failed
# open was never reported.
open my $in, '<', $ARGV[0] or die "Cannot open $ARGV[0]: $!\n";

my @ranges;                 # list of [ first code point, last code point, SCRIPT ]
my $file = '(unknown)';     # data file name taken from the "# Scripts-x.y.z.txt" header
my $easy_range;
my $i;
my $start;
my $end;
my $script;

while (<$in>) {
    # Remember the versioned file name from the header comment.
    if (/^\#\s+(Scripts-.*\.txt)/) {
        $file = $1;
    }

    # Strip comments, then skip lines that are empty.
    s/#.*//;
    next if /^\s*$/;

    # Data lines look like "0041..005A ; Latin" or "00AA ; Latin".
    if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) {
        die "Cannot parse line: '$_'\n";
    }

    if (defined $2) {
        push @ranges, [ hex $1, hex $2, uc $3 ];
    } else {
        push @ranges, [ hex $1, hex $1, uc $3 ];
    }
}

close $in;

@ranges = sort { $a->[0] <=> $b->[0] } @ranges;

my $date = gmtime;

print <<"EOT";
/* gscripttable.h: Generated by gen-script-table.pl
*
* Date: $date
* Source: $file
*
* Do not edit.
*/
EOT

$easy_range = 0x2000;

print <<"EOT";
#define G_EASY_SCRIPTS_RANGE $easy_range
static const guchar g_script_easy_table[$easy_range] = {
EOT

$i = 0;
$end = -1;

for (my $c = 0; $c < $easy_range; $c++) {
    # Three entries per output line.
    if ($c % 3 == 0) {
        print "\n ";
    }

    # Moved past the current range: load the next one.
    if ($c > $end) {
        if ($i <= $#ranges) {
            ($start, $end, $script) = @{ $ranges[$i] };
            $i++;
        } else {
            # Input ran out before the end of the easy table; everything
            # that is left is unassigned.
            $start = $easy_range;
            $end   = $easy_range - 1;
        }
    }

    # Code points in a gap before the current range are unassigned.
    if ($c < $start) {
        print " G_SCRIPT_UNKNOWN,";
    } else {
        printf " G_SCRIPT_%s,", $script;
    }
}

# The last range that was loaded reaches beyond the easy table: hand it back
# to the range table.  Only trim its start when it really begins inside the
# easy table; a range that starts above $easy_range must keep its own start,
# otherwise the gap in front of it would be labelled with its script.
if ($end >= $easy_range) {
    $i--;
    if ($ranges[$i]->[0] < $easy_range) {
        $ranges[$i]->[0] = $easy_range;
    }
}

print <<"EOT";
};
static const struct {
gunichar start;
guint16 chars;
guint16 script;
} g_script_table[] = {
EOT

for (; $i <= $#ranges; $i++) {
    $start = $ranges[$i]->[0];
    $end = $ranges[$i]->[1];
    $script = $ranges[$i]->[2];

    # Merge directly adjacent ranges that carry the same script.
    while ($i <= $#ranges - 1 &&
           $ranges[$i + 1]->[0] == $end + 1 &&
           $ranges[$i + 1]->[2] eq $script) {
        $i++;
        $end = $ranges[$i]->[1];
    }

    printf " { %#06x, %5d, G_SCRIPT_%s },\n", $start, $end - $start + 1, $script;
}

print "};\n";
......@@ -1256,6 +1256,7 @@ g_unichar_tolower G_GNUC_CONST
g_unichar_totitle G_GNUC_CONST
g_unichar_toupper G_GNUC_CONST
g_unichar_get_mirror_char
g_unichar_get_script
g_unichar_digit_value G_GNUC_CONST
g_unichar_xdigit_value G_GNUC_CONST
g_unichar_type G_GNUC_CONST
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -112,6 +112,84 @@ typedef enum
G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} GUnicodeBreakType;
/* GUnicodeScript: identifies the script (writing system) of a Unicode
 * character; this is the return type of g_unichar_get_script().
 *
 * G_UNICODE_SCRIPT_INVALID_CODE is fixed at -1 and G_UNICODE_SCRIPT_COMMON
 * at 0; every following value is numbered implicitly in declaration order,
 * so inserting or reordering entries changes the numeric value of all later
 * ones.  Entries are grouped by the Unicode version noted in the section
 * comments below.
 *
 * The trailing comment on each entry is the ISO 15924 code for the script.
 */
typedef enum
{ /* ISO 15924 code */
G_UNICODE_SCRIPT_INVALID_CODE = -1,
G_UNICODE_SCRIPT_COMMON = 0, /* Zyyy */
G_UNICODE_SCRIPT_INHERITED, /* Qaai */
G_UNICODE_SCRIPT_ARABIC, /* Arab */
G_UNICODE_SCRIPT_ARMENIAN, /* Armn */
G_UNICODE_SCRIPT_BENGALI, /* Beng */
G_UNICODE_SCRIPT_BOPOMOFO, /* Bopo */
G_UNICODE_SCRIPT_CHEROKEE, /* Cher */
G_UNICODE_SCRIPT_COPTIC, /* Qaac */
G_UNICODE_SCRIPT_CYRILLIC, /* Cyrl (Cyrs) */
G_UNICODE_SCRIPT_DESERET, /* Dsrt */
G_UNICODE_SCRIPT_DEVANAGARI, /* Deva */
G_UNICODE_SCRIPT_ETHIOPIC, /* Ethi */
G_UNICODE_SCRIPT_GEORGIAN, /* Geor (Geon, Geoa) */
G_UNICODE_SCRIPT_GOTHIC, /* Goth */
G_UNICODE_SCRIPT_GREEK, /* Grek */
G_UNICODE_SCRIPT_GUJARATI, /* Gujr */
G_UNICODE_SCRIPT_GURMUKHI, /* Guru */
G_UNICODE_SCRIPT_HAN, /* Hani */
G_UNICODE_SCRIPT_HANGUL, /* Hang */
G_UNICODE_SCRIPT_HEBREW, /* Hebr */
G_UNICODE_SCRIPT_HIRAGANA, /* Hira */
G_UNICODE_SCRIPT_KANNADA, /* Knda */
G_UNICODE_SCRIPT_KATAKANA, /* Kana */
G_UNICODE_SCRIPT_KHMER, /* Khmr */
G_UNICODE_SCRIPT_LAO, /* Laoo */
G_UNICODE_SCRIPT_LATIN, /* Latn (Latf, Latg) */
G_UNICODE_SCRIPT_MALAYALAM, /* Mlym */
G_UNICODE_SCRIPT_MONGOLIAN, /* Mong */
G_UNICODE_SCRIPT_MYANMAR, /* Mymr */
G_UNICODE_SCRIPT_OGHAM, /* Ogam */
G_UNICODE_SCRIPT_OLD_ITALIC, /* Ital */
G_UNICODE_SCRIPT_ORIYA, /* Orya */
G_UNICODE_SCRIPT_RUNIC, /* Runr */
G_UNICODE_SCRIPT_SINHALA, /* Sinh */
G_UNICODE_SCRIPT_SYRIAC, /* Syrc (Syrj, Syrn, Syre) */
G_UNICODE_SCRIPT_TAMIL, /* Taml */
G_UNICODE_SCRIPT_TELUGU, /* Telu */
G_UNICODE_SCRIPT_THAANA, /* Thaa */
G_UNICODE_SCRIPT_THAI, /* Thai */
G_UNICODE_SCRIPT_TIBETAN, /* Tibt */
G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, /* Cans */
G_UNICODE_SCRIPT_YI, /* Yiii */
G_UNICODE_SCRIPT_TAGALOG, /* Tglg */
G_UNICODE_SCRIPT_HANUNOO, /* Hano */
G_UNICODE_SCRIPT_BUHID, /* Buhd */
G_UNICODE_SCRIPT_TAGBANWA, /* Tagb */
/* Unicode-4.0 additions */
G_UNICODE_SCRIPT_BRAILLE, /* Brai */
G_UNICODE_SCRIPT_CYPRIOT, /* Cprt */
G_UNICODE_SCRIPT_LIMBU, /* Limb */
G_UNICODE_SCRIPT_OSMANYA, /* Osma */
G_UNICODE_SCRIPT_SHAVIAN, /* Shaw */
G_UNICODE_SCRIPT_LINEAR_B, /* Linb */
G_UNICODE_SCRIPT_TAI_LE, /* Tale */
G_UNICODE_SCRIPT_UGARITIC, /* Ugar */
/* Unicode-4.1 additions */
G_UNICODE_SCRIPT_NEW_TAI_LUE, /* Talu */
G_UNICODE_SCRIPT_BUGINESE, /* Bugi */
G_UNICODE_SCRIPT_GLAGOLITIC, /* Glag */
G_UNICODE_SCRIPT_TIFINAGH, /* Tfng */
G_UNICODE_SCRIPT_SYLOTI_NAGRI, /* Sylo */
G_UNICODE_SCRIPT_OLD_PERSIAN, /* Xpeo */
G_UNICODE_SCRIPT_KHAROSHTHI, /* Khar */
/* Unicode-5.0 additions */
G_UNICODE_SCRIPT_UNKNOWN, /* Zzzz */
G_UNICODE_SCRIPT_BALINESE, /* Bali */
G_UNICODE_SCRIPT_CUNEIFORM, /* Xsux */
G_UNICODE_SCRIPT_PHOENICIAN, /* Phnx */
G_UNICODE_SCRIPT_PHAGS_PA, /* Phag */
G_UNICODE_SCRIPT_NKO /* Nkoo */
} GUnicodeScript;
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
* not null, sets *CHARSET to the name of the current locale's
* charset. This value is statically allocated, and should be copied
......@@ -292,6 +370,9 @@ gchar *g_utf8_collate_key_for_filename (const gchar *str,
gboolean g_unichar_get_mirror_char (gunichar ch,
gunichar *mirrored_ch);
GUnicodeScript g_unichar_get_script (gunichar ch);
/* private */
gchar *_g_utf8_make_valid (const gchar *name);
......
......@@ -29,6 +29,7 @@
#include "glib.h"
#include "gunichartables.h"
#include "gmirroringtable.h"
#include "gscripttable.h"
#include "gunicodeprivate.h"
#include "galias.h"
......@@ -1183,5 +1184,55 @@ g_unichar_get_mirror_char (gunichar ch,
}
/* Index at which the very first binary search starts probing. */
#define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2)

/* Binary search over g_script_table for the range containing @ch.
 *
 * The index of the last successful hit is kept in a static variable and
 * used as the first probe of the next call, so consecutive lookups that
 * fall into the same range succeed on the first comparison.  The cached
 * index is shared by all callers and is not synchronized in this function.
 *
 * Returns the script stored for the matching range, or
 * G_UNICODE_SCRIPT_UNKNOWN when no range contains @ch.
 */
static inline GUnicodeScript
g_unichar_get_script_bsearch (gunichar ch)
{
  static int saved_mid = G_SCRIPT_TABLE_MIDPOINT;
  int lo = 0;
  int hi = G_N_ELEMENTS (g_script_table) - 1;
  int probe = saved_mid;

  for (;;)
    {
      gunichar first = g_script_table[probe].start;
      gunichar past_end = first + g_script_table[probe].chars;

      if (ch >= first && ch < past_end)
        {
          /* Hit: remember where we found it for the next call. */
          saved_mid = probe;
          return g_script_table[probe].script;
        }

      if (ch < first)
        hi = probe - 1;
      else
        lo = probe + 1;

      probe = (lo + hi) / 2;

      if (lo > hi)
        break;
    }

  return G_UNICODE_SCRIPT_UNKNOWN;
}
/**
 * g_unichar_get_script:
 * @ch: a Unicode character
 *
 * Determines the #GUnicodeScript of @ch (as defined by Unicode
 * Standard Annex #24). Characters below %G_EASY_SCRIPTS_RANGE are
 * read directly from a per-character table; all others are located
 * by a binary search over the table of script ranges.
 *
 * @ch is not checked for being a valid Unicode character; if you
 * pass in an invalid character, the result is undefined.
 *
 * Return value: the #GUnicodeScript for the character.
 *
 * Since: 2.14
 */
GUnicodeScript
g_unichar_get_script (gunichar ch)
{
  /* Anything outside the directly indexed table needs the range search. */
  if (ch >= G_EASY_SCRIPTS_RANGE)
    return g_unichar_get_script_bsearch (ch);

  return g_script_easy_table[ch];
}
#define __G_UNIPROP_C__
#include "galiasdef.c"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment