Commit 0196d639 authored by Marco Barisione's avatar Marco Barisione Committed by Marco Barisione

Add GRegex for regular expression matching. (#50075)

2007-03-15  Marco Barisione <marco@barisione.org>

	Add GRegex for regular expression matching.  (#50075)

	* configure.in: Handle GRegex compilation.

	* glib/gregex.c:
	* glib/gregex.h: Code for GRegex.

	* glib/Makefile.am:
	* glib/makefile.msc.in: Updated makefiles.

	* glib/pcre/*: Internal copy of PCRE.

	* glib/update-pcre/*: Stuff to automatically update the internal PCRE
	to a newer version.

	* tests/regex-test.c:
	* tests/Makefile.am:
	* tests/makefile.msc.in: Add tests for GRegex.

svn path=/trunk/; revision=5408
parent af867179
2007-03-15 Marco Barisione <marco@barisione.org>
Add GRegex for regular expression matching. (#50075)
* configure.in: Handle GRegex compilation.
* glib/gregex.c:
* glib/gregex.h: Code for GRegex.
* glib/Makefile.am:
* glib/makefile.msc.in: Updated makefiles.
* glib/pcre/*: Internal copy of PCRE.
* glib/update-pcre/*: Stuff to automatically update the internal PCRE
to a newer version.
* tests/regex-test.c:
* tests/Makefile.am:
* tests/makefile.msc.in: Add tests for GRegex.
2007-03-15 Chris Wilson <chris@chris-wilson.co.uk>
* glib/gmain.c (g_main_dispatch): Replace a
......
......@@ -173,7 +173,7 @@ AM_CONDITIONAL(MS_LIB_AVAILABLE, [test x$ms_librarian = xyes])
if test "$glib_native_win32" != yes; then
# libtool option to control which symbols are exported
# right now, symbols starting with _ are not exported
LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^[[^_]].*"'
LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^g.*"'
else
# We currently use .def files on Windows
LIBTOOL_EXPORT_OPTIONS=
......@@ -2146,6 +2146,74 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([[
[broken_poll="no (cross compiling)"])
AC_MSG_RESULT($broken_poll)
dnl *********************
dnl *** GRegex checks ***
dnl *********************
dnl Decide whether GRegex is built and which PCRE backs it: the bundled
dnl copy (the default) or the system library (--with-pcre=system).

# Minimum version accepted when the system-supplied PCRE is requested
PCRE_REQUIRED_VERSION=7.0

# Check if we should compile GRegex
AC_ARG_ENABLE(regex, AC_HELP_STRING([--disable-regex],
              [disable the compilation of GRegex]),
[case "${enableval}" in
  yes) enable_regex=true ;;
  no)  enable_regex=false ;;
  *) AC_MSG_ERROR([bad value ${enableval} for --enable-regex]) ;;
esac],
[enable_regex=true])

# enable_regex holds the literal command name true or false
AM_CONDITIONAL(ENABLE_REGEX, $enable_regex)

if test x$enable_regex = xtrue; then
  # Check if we should use the internal or the system-supplied pcre
  AC_ARG_WITH(pcre,
             [AC_HELP_STRING([--with-pcre=@<:@internal/system@:>@],
                             [specify whether to use the internal or the
                              system-supplied PCRE library])])

  AM_CONDITIONAL(USE_SYSTEM_PCRE, [test "x$with_pcre" = xsystem])

  if test "x$with_pcre" = xsystem; then
    PKG_CHECK_MODULES(PCRE,
                      libpcre >= $PCRE_REQUIRED_VERSION)
    AC_CACHE_CHECK([for Unicode support in PCRE],glib_cv_pcre_has_unicode,[
                   # Probe with the PCRE flags added to the existing ones and
                   # put the saved values back afterwards so that later checks
                   # and the build itself keep the flags chosen by the user.
                   glib_save_CFLAGS="$CFLAGS"
                   glib_save_LIBS="$LIBS"
                   CFLAGS="$CFLAGS $PCRE_CFLAGS" LIBS="$PCRE_LIBS $LIBS"
                   AC_TRY_RUN([#include <pcre.h>
                               int main () {
                                 int support;
                                 pcre_config (PCRE_CONFIG_UTF8, &support);
                                 if (!support)
                                   return 1;
                                 pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
                                 if (!support)
                                   return 1;
                                 return 0;
                               }],
                   glib_cv_pcre_has_unicode=yes,
                   glib_cv_pcre_has_unicode=no,
                   glib_cv_pcre_has_unicode=yes)
                   CFLAGS="$glib_save_CFLAGS"
                   LIBS="$glib_save_LIBS"
                   ])
    if test "$glib_cv_pcre_has_unicode" = "no"; then
      AC_MSG_ERROR([*** The system-supplied PCRE does not support Unicode properties or UTF-8.])
    fi
    AC_SUBST(PCRE_CFLAGS)
    AC_SUBST(PCRE_LIBS)
    AC_DEFINE(USE_SYSTEM_PCRE, [], [using the system-supplied PCRE library])
  else
    # If using gcc 4 pass -Wno-pointer-sign when compiling the internal PCRE
    if test x"$GCC" = xyes; then
      AC_MSG_CHECKING([whether gcc understands -Wno-pointer-sign])
      # The sed expression keeps the first digit printed by the compiler
      # version banner; it is bracket-quoted to protect it from m4.
      if test [`$CC --version | sed -e 's/[^0-9]*\([0-9]\).*/\1/' -e q`] -ge 4; then
        PCRE_WARN_CFLAGS="$PCRE_WARN_CFLAGS -Wno-pointer-sign"
        AC_MSG_RESULT([yes])
      else
        AC_MSG_RESULT([no])
      fi
    fi
  fi
  AC_SUBST(PCRE_WARN_CFLAGS)
else
  # The conditional must still be given a value when GRegex is disabled
  AM_CONDITIONAL(USE_SYSTEM_PCRE, false)
fi
dnl **********************
dnl *** Win32 API libs ***
dnl **********************
......@@ -2864,6 +2932,8 @@ Makefile
glib/Makefile
glib/libcharset/Makefile
glib/gnulib/Makefile
glib/pcre/Makefile
glib/update-pcre/Makefile
gmodule/Makefile
gmodule/gmoduleconf.h
gobject/Makefile
......
2007-03-15 Marco Barisione <marco@barisione.org>
Add GRegex for regular expression matching. (#50075)
* glib/Makefile.am:
* glib/glib-docs.sgml:
* glib/glib-sections.txt:
* glib/tmpl/glib-unused.sgml:
* glib/regex-syntax.sgml:
* glib/tmpl/gregex-unused.sgml:
* glib/tmpl/gregex.sgml: Add GRegex.
* glib/building.sgml: Document build options for GRegex.
2007-03-14 Stefan Kost <ensonic@users.sf.net>
* gobject/tmpl/gparamspec.sgml:
......
......@@ -37,7 +37,9 @@ IGNORE_HFILES= \
gmirroringtable.h \
gscripttable.h \
glib-mirroring-tab \
gnulib
gnulib \
pcre \
update-pcre
# Extra options to supply to gtkdoc-mkdb
MKDB_OPTIONS=--sgml-mode --output-format=xml --ignore-files=trio
......@@ -55,6 +57,7 @@ content_files = \
changes.sgml \
compiling.sgml \
resources.sgml \
regex-syntax.sgml \
version.xml \
glib-gettextize.xml
......
......@@ -146,6 +146,16 @@ How to compile GLib itself
e.g. POSIX threads, DCE threads or Solaris threads.
</para>
</listitem>
<listitem>
<para>
GRegex uses the <ulink url="http://www.pcre.org/">PCRE library</ulink>
for regular expression matching. The default is to use the internal
version of PCRE that is patched to use GLib for memory management
and Unicode handling. If you prefer to use the system-supplied PCRE
library you can pass the --with-pcre=system option to configure,
but it is not recommended.
</para>
</listitem>
</itemizedlist>
</refsect1>
......@@ -177,6 +187,13 @@ How to compile GLib itself
<group>
<arg>--with-threads=[none|posix|dce|win32]</arg>
</group>
<group>
<arg>--disable-regex</arg>
<arg>--enable-regex</arg>
</group>
<group>
<arg>--with-pcre=[internal|system]</arg>
</group>
<group>
<arg>--disable-included-printf</arg>
<arg>--enable-included-printf</arg>
......@@ -361,6 +378,61 @@ How to compile GLib itself
</para>
</formalpara>
<formalpara>
<title><systemitem>--disable-regex</systemitem> and
<systemitem>--enable-regex</systemitem></title>
<para>
Do not compile GLib with regular expression support.
GLib will be smaller because it will not need the
PCRE library. This is however not recommended, as
programs may need GRegex.
</para>
</formalpara>
<formalpara>
<title><systemitem>--with-pcre</systemitem></title>
<para>
Specify whether to use the internal or the system-supplied
PCRE library.
<itemizedlist>
<listitem><para>
'internal' means that GRegex will be compiled to use
the internal PCRE library.
</para></listitem>
<listitem><para>
'system' means that GRegex will be compiled to use
the system-supplied PCRE library.
</para></listitem>
</itemizedlist>
Using the internal PCRE is the preferred solution:
<itemizedlist>
<listitem>
<para>
System-supplied PCRE has a separate copy of the big tables
used for Unicode handling.
</para>
</listitem>
<listitem>
<para>
Some systems have PCRE libraries compiled without some needed
features, such as UTF-8 and Unicode support.
</para>
</listitem>
<listitem>
<para>
PCRE uses some global variables for memory management and
other features. In the rare case of a program using both
GRegex and PCRE (maybe indirectly through a library),
these variables could lead to problems when they are modified.
</para>
</listitem>
</itemizedlist>
</para>
</formalpara>
<formalpara>
<title><systemitem>--disable-included-printf</systemitem> and
<systemitem>--enable-included-printf</systemitem></title>
......
......@@ -61,6 +61,7 @@
<!ENTITY glib-Bookmarkfile SYSTEM "xml/bookmarkfile.xml">
<!ENTITY glib-Base64 SYSTEM "xml/base64.xml">
<!ENTITY glib-i18n SYSTEM "xml/i18n.xml">
<!ENTITY glib-Regex SYSTEM "xml/gregex.xml">
<!ENTITY glib-Version SYSTEM "xml/version.xml">
<!ENTITY glib-Compiling SYSTEM "compiling.sgml">
......@@ -69,6 +70,7 @@
<!ENTITY glib-Running SYSTEM "running.sgml">
<!ENTITY glib-Resources SYSTEM "resources.sgml">
<!ENTITY glib-Changes SYSTEM "changes.sgml">
<!ENTITY glib-RegexSyntax SYSTEM "regex-syntax.sgml">
<!ENTITY glib-gettextize SYSTEM "glib-gettextize.xml">
......@@ -101,6 +103,7 @@ synchronize their operation.
&glib-Compiling;
&glib-Running;
&glib-Changes;
&glib-RegexSyntax;
&glib-Resources;
</chapter>
......@@ -151,6 +154,7 @@ synchronize their operation.
&glib-Shell;
&glib-Option;
&glib-Pattern-Matching;
&glib-Regex;
&glib-Markup;
&glib-Keyfile;
&glib-Bookmarkfile;
......
......@@ -863,6 +863,50 @@ g_pattern_match_string
g_pattern_match_simple
</SECTION>
<SECTION>
<TITLE>Perl-compatible regular expressions</TITLE>
<FILE>gregex</FILE>
GRegexError
G_REGEX_ERROR
GRegexCompileFlags
GRegexMatchFlags
GRegex
GRegexEvalCallback
g_regex_new
g_regex_free
g_regex_optimize
g_regex_copy
g_regex_get_pattern
g_regex_clear
g_regex_match_simple
g_regex_match
g_regex_match_full
g_regex_match_next
g_regex_match_next_full
g_regex_match_all
g_regex_match_all_full
g_regex_get_match_count
g_regex_is_partial_match
g_regex_fetch
g_regex_fetch_pos
g_regex_fetch_named
g_regex_fetch_named_pos
g_regex_fetch_all
g_regex_get_string_number
g_regex_split_simple
g_regex_split
g_regex_split_full
g_regex_split_next
g_regex_split_next_full
g_regex_expand_references
g_regex_replace
g_regex_replace_literal
g_regex_replace_eval
g_regex_escape_string
<SUBSECTION Private>
g_regex_error_quark
</SECTION>
<SECTION>
<TITLE>Message Logging</TITLE>
<FILE>messages</FILE>
......
This diff is collapsed.
......@@ -712,6 +712,13 @@ To use this function you must configure glib with the flag
@mem: the memory to check.
<!-- ##### FUNCTION g_regex_error_quark ##### -->
<para>
</para>
@Returns:
<!-- ##### FUNCTION g_scanner_stat_mode ##### -->
<para>
Gets the file attributes.
......
<!-- ##### SECTION Title ##### -->
Perl-compatible regular expressions
<!-- ##### SECTION Short_Description ##### -->
matches strings against regular expressions.
<!-- ##### SECTION Long_Description ##### -->
<para>
The <function>g_regex_*()</function> functions implement regular
expression pattern matching using syntax and semantics similar to
Perl regular expressions.
</para>
<para>
Some functions accept a <parameter>start_position</parameter> argument,
setting it differs from just passing over a shortened string and setting
#G_REGEX_MATCH_NOTBOL in the case of a pattern that begins with any kind
of lookbehind assertion.
For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
in the middle of words. ("\B" matches only if the current position in the
subject is not a word boundary.) When applied to the string "Mississipi"
from the fourth byte, namely "issipi", it does not match, because "\B" is
always false at the start of the subject, which is deemed to be a word
boundary. However, if the entire string is passed, but with
<parameter>start_position</parameter> set to 4, it finds the second
occurrence of "iss" because it is able to look behind the starting point
to discover that it is preceded by a letter.
</para>
<para>
Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
to these functions must be encoded in UTF-8. The lengths and the positions
inside the strings are in bytes and not in characters, so, for instance,
"\xc3\xa0" (i.e. "&agrave;") is two bytes long but it is treated as a single
character. If you set #G_REGEX_RAW the strings can be non-valid UTF-8
strings and a byte is treated as a character, so "\xc3\xa0" is two bytes
and two characters long.
</para>
<para>
When matching a pattern, "\n" matches only against a "\n" character in the
string, and "\r" matches only a "\r" character. To match any newline sequence
use "\R". This particular group matches either the two-character sequence
CR + LF ("\r\n"), or one of the single characters LF (linefeed, U+000A, "\n"), VT
(vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), CR (carriage return,
U+000D, "\r"), NEL (next line, U+0085), LS (line separator, U+2028), or PS
(paragraph separator, U+2029).
</para>
<para>
The behaviour of the dot, circumflex, and dollar metacharacters is affected by
newline characters; the default is to recognize any newline character (the same
characters recognized by "\R"). This can be changed with #G_REGEX_NEWLINE_CR,
#G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF compile options,
and with #G_REGEX_MATCH_NEWLINE_ANY, #G_REGEX_MATCH_NEWLINE_CR,
#G_REGEX_MATCH_NEWLINE_LF and #G_REGEX_MATCH_NEWLINE_CRLF match options.
These settings are also relevant when compiling a pattern if
#G_REGEX_EXTENDED is set, and an unescaped "#" outside a character class is
encountered. This indicates a comment that lasts until after the next
newline.
</para>
<para>
If you have two threads manipulating the same #GRegex, they must use a
lock to synchronize their operation, as these functions are not threadsafe.
Creating and manipulating different #GRegex structures from different
threads is not a problem.
</para>
<para>
The low-level regular expression functionality is provided by
the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
written by Philip Hazel.
</para>
<!-- ##### SECTION See_Also ##### -->
<para>
</para>
<!-- ##### SECTION Stability_Level ##### -->
<!-- ##### ENUM GRegexError ##### -->
<para>
Error codes returned by regular expression functions.
</para>
@G_REGEX_ERROR_COMPILE: Compilation of the regular expression in <function>g_regex_new()</function> failed.
@G_REGEX_ERROR_OPTIMIZE: Optimization of the regular expression in <function>g_regex_optimize()</function> failed.
@G_REGEX_ERROR_REPLACE: Replacement failed due to an ill-formed replacement string.
@G_REGEX_ERROR_MATCH: The match process failed.
@Since: 2.14
<!-- ##### MACRO G_REGEX_ERROR ##### -->
<para>
Error domain for regular expressions. Errors in this domain will be from the #GRegexError enumeration. See #GError for information on error domains.
</para>
@Since: 2.14
<!-- ##### ENUM GRegexCompileFlags ##### -->
<para>
Flags specifying compile-time options.
</para>
@G_REGEX_CASELESS: Letters in the pattern match both upper and lower case
letters. It can be changed within a pattern by a "(?i)" option setting.
@G_REGEX_MULTILINE: By default, GRegex treats the strings as consisting
of a single line of characters (even if it actually contains newlines).
The "start of line" metacharacter ("^") matches only at the start of the
string, while the "end of line" metacharacter ("$") matches only at the
end of the string, or before a terminating newline (unless
#G_REGEX_DOLLAR_ENDONLY is set). When #G_REGEX_MULTILINE is set,
the "start of line" and "end of line" constructs match immediately following
or immediately before any newline in the string, respectively, as well
as at the very start and end. This can be changed within a pattern by a
"(?m)" option setting.
@G_REGEX_DOTALL: A dot metacharacter (".") in the pattern matches all
characters, including newlines. Without it, newlines are excluded. This
option can be changed within a pattern by a "(?s)" option setting.
@G_REGEX_EXTENDED: Whitespace data characters in the pattern are
totally ignored except when escaped or inside a character class.
Whitespace does not include the VT character (code 11). In addition,
characters between an unescaped "#" outside a character class and
the next newline character, inclusive, are also ignored. This can be
changed within a pattern by a "(?x)" option setting.
@G_REGEX_ANCHORED: The pattern is forced to be "anchored", that is,
it is constrained to match only at the first matching point in the string
that is being searched. This effect can also be achieved by appropriate
constructs in the pattern itself such as the "^" metacharacter.
@G_REGEX_DOLLAR_ENDONLY: A dollar metacharacter ("$") in the pattern
matches only at the end of the string. Without this option, a dollar also
matches immediately before the final character if it is a newline (but
not before any other newlines). This option is ignored if
#G_REGEX_MULTILINE is set.
@G_REGEX_UNGREEDY: Inverts the "greediness" of the
quantifiers so that they are not greedy by default, but become greedy
if followed by "?". It can also be set by a "(?U)" option setting within
the pattern.
@G_REGEX_RAW: Usually strings must be valid UTF-8 strings, using this
flag they are considered as a raw sequence of bytes.
@G_REGEX_NO_AUTO_CAPTURE: Disables the use of numbered capturing
parentheses in the pattern. Any opening parenthesis that is not followed
by "?" behaves as if it were followed by "?:" but named parentheses can
still be used for capturing (and they acquire numbers in the usual way).
@G_REGEX_DUPNAMES: Names used to identify capturing subpatterns need not
be unique. This can be helpful for certain types of pattern when it is known
that only one instance of the named subpattern can ever be matched.
@G_REGEX_NEWLINE_CR: Usually any newline character is recognized, if this
option is set, the only recognized newline character is '\r'.
@G_REGEX_NEWLINE_LF: Usually any newline character is recognized, if this
option is set, the only recognized newline character is '\n'.
@G_REGEX_NEWLINE_CRLF: Usually any newline character is recognized, if this
option is set, the only recognized newline character sequence is '\r\n'.
@Since: 2.14
<!-- ##### ENUM GRegexMatchFlags ##### -->
<para>
Flags specifying match-time options.
</para>
@G_REGEX_MATCH_ANCHORED: The pattern is forced to be "anchored", that is,
it is constrained to match only at the first matching point in the string
that is being searched. This effect can also be achieved by appropriate
constructs in the pattern itself such as the "^" metacharacter.
@G_REGEX_MATCH_NOTBOL: Specifies that the first character of the string is
not the beginning of a line, so the circumflex metacharacter should not
match before it. Setting this without G_REGEX_MULTILINE (at compile time)
causes circumflex never to match. This option affects only the behaviour of
the circumflex metacharacter, it does not affect "\A".
@G_REGEX_MATCH_NOTEOL: Specifies that the end of the subject string is
not the end of a line, so the dollar metacharacter should not match it nor
(except in multiline mode) a newline immediately before it. Setting this
without G_REGEX_MULTILINE (at compile time) causes dollar never to match.
This option affects only the behaviour of the dollar metacharacter, it does
not affect "\Z" or "\z".
@G_REGEX_MATCH_NOTEMPTY: An empty string is not considered to be a valid
match if this option is set. If there are alternatives in the pattern, they
are tried. If all the alternatives match the empty string, the entire match
fails. For example, if the pattern "a?b?" is applied to a string not beginning
with "a" or "b", it matches the empty string at the start of the string.
With this flag set, this match is not valid, so GRegex searches further
into the string for occurrences of "a" or "b".
@G_REGEX_MATCH_PARTIAL: Turns on the partial matching feature, for more
documentation on partial matching see g_regex_is_partial_match().
@G_REGEX_MATCH_NEWLINE_CR: Overrides the newline definition set when creating
a new #GRegex, setting the '\r' character as line terminator.
@G_REGEX_MATCH_NEWLINE_LF: Overrides the newline definition set when creating
a new #GRegex, setting the '\n' character as line terminator.
@G_REGEX_MATCH_NEWLINE_CRLF: Overrides the newline definition set when creating
a new #GRegex, setting the '\r\n' characters as line terminator.
@G_REGEX_MATCH_NEWLINE_ANY: Overrides the newline definition set when creating
a new #GRegex, any newline character or character sequence is recognized.
@Since: 2.14
<!-- ##### STRUCT GRegex ##### -->
<para>
A GRegex is the "compiled" form of a regular expression pattern. This
structure is opaque and its fields cannot be accessed directly.
</para>
@Since: 2.14
<!-- ##### USER_FUNCTION GRegexEvalCallback ##### -->
<para>
Specifies the type of the function passed to g_regex_replace_eval().
It is called for each occurrence of the pattern @regex in @string, and it
should append the replacement to @result.
</para>
<para>
Do not call on @regex functions that modify its internal state, such as
g_regex_match(); if you need it you can create a temporary copy of
@regex using g_regex_copy().
</para>
@Param1: a #GRegex.
@Param2: the string used to perform matches against.
@Param3: a #GString containing the new string.
@Param4: user data passed to g_regex_replace_eval().
@Returns: %FALSE to continue the replacement process, %TRUE to stop it.
@Since: 2.14
<!-- ##### FUNCTION g_regex_new ##### -->
<para>
</para>
@pattern:
@compile_options:
@match_options:
@error:
@Returns:
<!-- ##### FUNCTION g_regex_free ##### -->
<para>
</para>
@regex:
<!-- ##### FUNCTION g_regex_optimize ##### -->
<para>
</para>
@regex:
@error:
@Returns:
<!-- ##### FUNCTION g_regex_copy ##### -->
<para>
</para>
@regex:
@Returns:
<!-- ##### FUNCTION g_regex_get_pattern ##### -->
<para>
</para>
@regex:
@Returns:
<!-- ##### FUNCTION g_regex_clear ##### -->
<para>
</para>
@regex:
<!-- ##### FUNCTION g_regex_match_simple ##### -->
<para>
</para>
@pattern:
@string:
@compile_options:
@match_options:
@Returns:
<!-- ##### FUNCTION g_regex_match ##### -->
<para>
</para>
@regex:
@string:
@match_options:
@Returns:
<!-- ##### FUNCTION g_regex_match_full ##### -->
<para>
</para>
@regex:
@string:
@string_len:
@start_position:
@match_options:
@error:
@Returns:
<!-- ##### FUNCTION g_regex_match_next ##### -->
<para>
</para>
@regex:
@string:
@match_options:
@Returns:
<!-- ##### FUNCTION g_regex_match_next_full ##### -->
<para>
</para>
@regex:
@string:
@string_len:
@start_position:
@match_options:
@error:
@Returns:
<!-- ##### FUNCTION g_regex_match_all ##### -->
<para>
</para>
@regex:
@string:
@match_options:
@Returns:
<!-- ##### FUNCTION g_regex_match_all_full ##### -->
<para>
</para>
@regex:
@string:
@string_len:
@start_position:
@match_options:
@error:
@Returns:
<!-- ##### FUNCTION g_regex_get_match_count ##### -->
<para>
</para>
@regex:
@Returns:
<!-- ##### FUNCTION g_regex_is_partial_match ##### -->
<para>
</para>
@regex:
@Returns:
<!-- ##### FUNCTION g_regex_fetch ##### -->
<para>
</para>
@regex:
@match_num:
@string:
@Returns:
<!-- ##### FUNCTION g_regex_fetch_pos ##### -->
<para>
</para>
@regex:
@match_num:
@start_pos:
@end_pos:
@Returns:
<!-- ##### FUNCTION g_regex_fetch_named ##### -->