Commit dfcf34ef authored by Christian Hergert

pcre2: port GtkSourceRegex to PCRE2

The goal here is to move to a JIT-backed PCRE2 implementation; however,
this commit only gets an initial port in place so that this code uses PCRE2. We can
eventually move other parts of GtkSourceView over to this implementation
as well, depending on how complete we need to be.
parent 1c6e2ce1
...@@ -354,8 +354,8 @@ _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex, ...@@ -354,8 +354,8 @@ _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex,
gint *start_pos_p, /* byte offsets */ gint *start_pos_p, /* byte offsets */
gint *end_pos_p) /* byte offsets */ gint *end_pos_p) /* byte offsets */
{ {
gint start_pos; gint start_pos = -1;
gint end_pos; gint end_pos = -1;
g_assert (regex->resolved); g_assert (regex->resolved);
......
...@@ -41,6 +41,7 @@ gboolean impl_regex_match (const ImplRegex *regex, ...@@ -41,6 +41,7 @@ gboolean impl_regex_match (const ImplRegex *regex,
const char *string, const char *string,
GRegexMatchFlags match_options, GRegexMatchFlags match_options,
ImplMatchInfo **match_info); ImplMatchInfo **match_info);
ImplRegex *impl_regex_ref (ImplRegex *regex);
void impl_regex_unref (ImplRegex *regex); void impl_regex_unref (ImplRegex *regex);
void impl_match_info_free (ImplMatchInfo *match_info); void impl_match_info_free (ImplMatchInfo *match_info);
char *impl_match_info_fetch (const ImplMatchInfo *match_info, char *impl_match_info_fetch (const ImplMatchInfo *match_info,
...@@ -50,7 +51,7 @@ char *impl_match_info_fetch_named (const ImplMatchInfo *match_info, ...@@ -50,7 +51,7 @@ char *impl_match_info_fetch_named (const ImplMatchInfo *match_info,
char *impl_regex_replace_eval (const ImplRegex *regex, char *impl_regex_replace_eval (const ImplRegex *regex,
const char *string, const char *string,
gssize string_len, gssize string_len,
int start_position, gsize start_position,
GRegexMatchFlags match_options, GRegexMatchFlags match_options,
ImplRegexEvalCallback eval, ImplRegexEvalCallback eval,
gpointer user_data, gpointer user_data,
...@@ -58,18 +59,21 @@ char *impl_regex_replace_eval (const ImplRegex *regex, ...@@ -58,18 +59,21 @@ char *impl_regex_replace_eval (const ImplRegex *regex,
gboolean impl_regex_match_full (const ImplRegex *regex, gboolean impl_regex_match_full (const ImplRegex *regex,
const char *string, const char *string,
gssize string_len, gssize string_len,
int start_position, gsize start_position,
GRegexMatchFlags match_options, GRegexMatchFlags match_options,
ImplMatchInfo **match_info, ImplMatchInfo **match_info,
GError **error); GError **error);
gboolean impl_match_info_fetch_pos (const ImplMatchInfo *match_info, gboolean impl_match_info_fetch_pos (const ImplMatchInfo *match_info,
int match_num, guint match_num,
int *start_pos, int *start_pos,
int *end_pos); int *end_pos);
gboolean impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info, gboolean impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info,
const char *name, const char *name,
int *start_pos, int *start_pos,
int *end_pos); int *end_pos);
gboolean impl_match_info_matches (const ImplMatchInfo *match_info);
gboolean impl_match_info_next (ImplMatchInfo *match_info,
GError **error);
const char *impl_regex_get_pattern (const ImplRegex *regex); const char *impl_regex_get_pattern (const ImplRegex *regex);
G_END_DECLS G_END_DECLS
/* /*
* This file is part of GtkSourceView * This file is part of GtkSourceView
* *
* Copyright 1999, 2000 Scott Wimer
* Copyright 2004, Matthias Clasen <mclasen@redhat.com>
* Copyright 2005 - 2007, Marco Barisione <marco@barisione.org>
* Copyright 2020 Christian Hergert <chergert@redhat.com> * Copyright 2020 Christian Hergert <chergert@redhat.com>
* *
* GtkSourceView is free software; you can redistribute it and/or * GtkSourceView is free software; you can redistribute it and/or
...@@ -21,46 +24,96 @@ ...@@ -21,46 +24,96 @@
#include "config.h" #include "config.h"
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#include <string.h>
#include "implregex-private.h" #include "implregex-private.h"
#include "gtksourcetrace.h"
struct _ImplRegex struct _ImplRegex
{ {
int ref_count; int ref_count;
char *pattern; char *pattern;
GRegex *re; gsize compile_flags;
gsize match_flags;
pcre2_compile_context *context;
pcre2_code *code;
PCRE2_SPTR name_table;
int name_count;
int name_entry_size;
}; };
struct _ImplMatchInfo struct _ImplMatchInfo
{ {
GMatchInfo *match_info; gsize compile_flags;
gsize match_flags;
ImplRegex *regex;
const char *string;
gsize string_len;
pcre2_match_data *match_data;
PCRE2_SIZE *offsets;
int n_groups;
gsize start_pos;
}; };
#if 0 static gsize
static void translate_compile_flags (GRegexCompileFlags flags)
set_regex_error (GError **error,
int errnum)
{ {
guchar errstr[128]; gsize ret = 0;
if (!(flags & G_REGEX_RAW))
ret |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
if (flags & G_REGEX_ANCHORED)
ret |= PCRE2_ANCHORED;
if (flags & G_REGEX_CASELESS)
ret |= PCRE2_CASELESS;
pcre2_get_error_message (errnum, errstr, sizeof errstr - 1); if (flags & G_REGEX_EXTENDED)
errstr[sizeof errstr - 1] = 0; ret |= PCRE2_EXTENDED;
g_set_error_literal (error, ret |= PCRE2_UCP;
G_REGEX_ERROR, ret |= PCRE2_BSR_UNICODE;
G_REGEX_ERROR_COMPILE,
(const gchar *)errstr); return ret;
} }
#endif
static ImplMatchInfo * static gsize
impl_match_info_new (const ImplRegex *regex) translate_match_flags (GRegexMatchFlags flags)
{ {
ImplMatchInfo *match_info; gsize ret = 0;
match_info = g_slice_new0 (ImplMatchInfo); if (flags & G_REGEX_MATCH_ANCHORED)
match_info->match_info = NULL; ret |= PCRE2_ANCHORED;
return match_info; return ret;
}
/*
 * set_regex_error:
 * @error: location for a GError, or NULL
 * @rc: return code obtained from a PCRE2 call
 *
 * Classifies @rc and, for the failing case, fills in @error.
 *
 * Returns FALSE without touching @error when @rc is positive. For every
 * other value (zero or negative) returns TRUE and, when @error is non-NULL,
 * stores the PCRE2 message for @rc using G_REGEX_ERROR / G_REGEX_ERROR_MATCH.
 *
 * NOTE(review): rc == 0 also takes the error path here; confirm against the
 * callers that this is intended.
 */
static gboolean
set_regex_error (GError **error,
int rc)
{
/* Positive return codes are not reported as errors. */
if (rc > 0)
{
return FALSE;
}
if (error != NULL)
{
guchar errstr[128];
/*
 * Hand PCRE2 one byte less than the buffer and force the final byte to
 * NUL so the string passed on below is always terminated.
 * NOTE(review): the return value of pcre2_get_error_message() is ignored.
 */
pcre2_get_error_message (rc, errstr, sizeof errstr - 1);
errstr[sizeof errstr - 1] = 0;
g_set_error_literal (error,
G_REGEX_ERROR,
G_REGEX_ERROR_MATCH,
(const gchar *)errstr);
}
return TRUE;
} }
ImplRegex * ImplRegex *
...@@ -69,22 +122,83 @@ impl_regex_new (const char *pattern, ...@@ -69,22 +122,83 @@ impl_regex_new (const char *pattern,
GRegexMatchFlags match_options, GRegexMatchFlags match_options,
GError **error) GError **error)
{ {
GRegex *re; pcre2_compile_context *context;
ImplRegex *regex; ImplRegex *regex;
PCRE2_SIZE erroffset;
int errnumber = -1;
#ifdef GTK_SOURCE_PROFILER_ENABLED
char *message = NULL;
GTK_SOURCE_PROFILER_BEGIN_MARK;
#endif
g_return_val_if_fail (pattern != NULL, NULL); g_return_val_if_fail (pattern != NULL, NULL);
g_return_val_if_fail (strstr (pattern, "\\K") == NULL, NULL);
re = g_regex_new (pattern, compile_options, match_options, error); context = pcre2_compile_context_create (NULL);
if (re == NULL) regex = g_slice_new0 (ImplRegex);
regex->ref_count = 1;
regex->context = context;
regex->pattern = g_strdup (pattern);
regex->compile_flags = translate_compile_flags (compile_options);
regex->match_flags = translate_match_flags (match_options);
if (compile_options & G_REGEX_NEWLINE_LF)
pcre2_set_newline (context, PCRE2_NEWLINE_LF);
else if (compile_options & G_REGEX_NEWLINE_CR)
pcre2_set_newline (context, PCRE2_NEWLINE_CR);
else if (compile_options & G_REGEX_NEWLINE_CRLF)
pcre2_set_newline (context, PCRE2_NEWLINE_CRLF);
else if (compile_options & G_REGEX_NEWLINE_ANYCRLF)
pcre2_set_newline (context, PCRE2_NEWLINE_ANYCRLF);
regex->code = pcre2_compile ((PCRE2_SPTR)pattern,
PCRE2_ZERO_TERMINATED,
regex->compile_flags,
&errnumber,
&erroffset,
context);
if (regex->code == NULL)
{ {
char errmsg[128];
pcre2_get_error_message (errnumber, (guchar *)errmsg, sizeof errmsg-1);
errmsg[sizeof errmsg-1] = 0;
g_set_error (error,
G_REGEX_ERROR,
G_REGEX_ERROR_COMPILE,
"%s: offset %d of pattern %s",
errmsg,
(int)erroffset,
pattern);
impl_regex_unref (regex);
return NULL; return NULL;
} }
regex = g_slice_new0 (ImplRegex); pcre2_pattern_info (regex->code, PCRE2_INFO_NAMECOUNT, &regex->name_count);
regex->ref_count = 1;
regex->pattern = g_strdup (pattern); if (regex->name_count > 0)
regex->re = re; {
pcre2_pattern_info (regex->code,
PCRE2_INFO_NAMEENTRYSIZE,
&regex->name_entry_size);
pcre2_pattern_info (regex->code,
PCRE2_INFO_NAMETABLE,
&regex->name_table);
}
#ifdef GTK_SOURCE_PROFILER_ENABLED
if (GTK_SOURCE_PROFILER_ACTIVE)
message = g_strdup_printf ("compile=%lx match=%lx pattern=%s",
regex->compile_flags,
regex->match_flags,
regex->pattern);
GTK_SOURCE_PROFILER_END_MARK (G_STRFUNC, message);
g_free (message);
#endif
return regex; return regex;
} }
...@@ -97,6 +211,17 @@ impl_regex_get_pattern (const ImplRegex *regex) ...@@ -97,6 +211,17 @@ impl_regex_get_pattern (const ImplRegex *regex)
return regex->pattern; return regex->pattern;
} }
/*
 * impl_regex_ref:
 * @regex: an ImplRegex with a positive reference count
 *
 * Adds one reference to @regex.
 *
 * Returns: @regex itself, or NULL when @regex is NULL or its reference
 *   count is not positive (the precondition checks reject the call).
 */
ImplRegex *
impl_regex_ref (ImplRegex *regex)
{
	g_return_val_if_fail (regex != NULL, NULL);
	g_return_val_if_fail (regex->ref_count > 0, NULL);

	regex->ref_count += 1;

	return regex;
}
void void
impl_regex_unref (ImplRegex *regex) impl_regex_unref (ImplRegex *regex)
{ {
...@@ -108,16 +233,64 @@ impl_regex_unref (ImplRegex *regex) ...@@ -108,16 +233,64 @@ impl_regex_unref (ImplRegex *regex)
if (regex->ref_count == 0) if (regex->ref_count == 0)
{ {
g_clear_pointer (&regex->pattern, g_free); g_clear_pointer (&regex->pattern, g_free);
g_clear_pointer (&regex->re, g_regex_unref); g_clear_pointer (&regex->code, pcre2_code_free);
g_clear_pointer (&regex->context, pcre2_compile_context_free);
g_slice_free (ImplRegex, regex); g_slice_free (ImplRegex, regex);
} }
} }
/*
 * impl_match_info_new:
 * @regex: the compiled regex the match state belongs to
 * @match_options: per-match flags, merged with the flags stored on @regex
 * @string: the subject string; only the pointer is stored, it is not copied
 * @string_len: length of @string in bytes, or a negative value to have it
 *   computed with strlen()
 *
 * Allocates a zeroed ImplMatchInfo, takes a reference on @regex and creates
 * the PCRE2 match data block sized from the compiled pattern. The offsets
 * pointer is the ovector owned by that match data block.
 *
 * Returns: a newly allocated ImplMatchInfo.
 */
static ImplMatchInfo *
impl_match_info_new (ImplRegex        *regex,
                     GRegexMatchFlags  match_options,
                     const char       *string,
                     gssize            string_len)
{
	ImplMatchInfo *match_info;

	g_assert (regex != NULL);
	g_assert (string != NULL);

	/*
	 * Resolve a negative length before any bounds check. Comparing the
	 * signed @string_len directly against the size_t returned by strlen()
	 * converts -1 to SIZE_MAX, which made the assertion fail for the
	 * documented "compute the length for me" case.
	 */
	if (string_len < 0)
	{
		string_len = strlen (string);
	}
	else
	{
		g_assert ((gsize)string_len <= strlen (string));
	}

	match_info = g_slice_new0 (ImplMatchInfo);
	match_info->regex = impl_regex_ref (regex);
	match_info->match_flags = regex->match_flags | translate_match_flags (match_options);
	/* start_pos is a gsize, so -1 is stored as the maximum gsize value. */
	match_info->start_pos = -1;
	match_info->n_groups = -1;
	match_info->string = string;
	match_info->string_len = string_len;
	match_info->match_data = pcre2_match_data_create_from_pattern (regex->code, NULL);

	if (match_info->match_data == NULL)
	{
		g_error ("Failed to allocate match data");
	}

	match_info->offsets = pcre2_get_ovector_pointer (match_info->match_data);

	return match_info;
}
void void
impl_match_info_free (ImplMatchInfo *match_info) impl_match_info_free (ImplMatchInfo *match_info)
{ {
g_clear_pointer (&match_info->match_info, g_match_info_free); if (match_info != NULL)
g_slice_free (ImplMatchInfo, match_info); {
g_clear_pointer (&match_info->match_data, pcre2_match_data_free);
g_clear_pointer (&match_info->regex, impl_regex_unref);
match_info->string = NULL;
match_info->string_len = 0;
match_info->compile_flags = 0;
match_info->match_flags = 0;
match_info->n_groups = 0;
match_info->start_pos = 0;
match_info->offsets = NULL;
g_slice_free (ImplMatchInfo, match_info);
}
} }
gboolean gboolean
...@@ -127,115 +300,156 @@ impl_regex_match (const ImplRegex *regex, ...@@ -127,115 +300,156 @@ impl_regex_match (const ImplRegex *regex,
ImplMatchInfo **match_info) ImplMatchInfo **match_info)
{ {
g_return_val_if_fail (regex != NULL, FALSE); g_return_val_if_fail (regex != NULL, FALSE);
g_return_val_if_fail (regex->re != NULL, FALSE); g_return_val_if_fail (regex->code != NULL, FALSE);
g_return_val_if_fail (string != NULL, FALSE);
if (match_info != NULL) return impl_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL);
{
*match_info = impl_match_info_new (regex);
}
return g_regex_match (regex->re,
string,
match_options,
match_info ? &(*match_info)->match_info : NULL);
} }
char * char *
impl_match_info_fetch (const ImplMatchInfo *match_info, impl_match_info_fetch (const ImplMatchInfo *match_info,
int match_num) int match_num)
{ {
int begin = -1;
int end = -1;
g_return_val_if_fail (match_info != NULL, NULL); g_return_val_if_fail (match_info != NULL, NULL);
g_return_val_if_fail (match_info->string != NULL, NULL);
g_return_val_if_fail (match_info->offsets != NULL, NULL);
g_return_val_if_fail (impl_match_info_matches (match_info), NULL);
if (impl_match_info_fetch_pos (match_info, match_num, &begin, &end))
{
if (begin >= 0 && end >= 0)
{
return g_strndup (match_info->string + begin, end - begin);
}
return g_strdup ("");
}
return g_match_info_fetch (match_info->match_info, match_num); return NULL;
} }
char * char *
impl_match_info_fetch_named (const ImplMatchInfo *match_info, impl_match_info_fetch_named (const ImplMatchInfo *match_info,
const char *name) const char *name)
{ {
int begin = -1;
int end = -1;
g_return_val_if_fail (match_info != NULL, NULL); g_return_val_if_fail (match_info != NULL, NULL);
return g_match_info_fetch_named (match_info->match_info, name); if (match_info->start_pos < match_info->string_len)
} {
if (impl_match_info_fetch_named_pos (match_info, name, &begin, &end))
{
if (begin >= 0 && end >= 0)
{
return g_strndup (match_info->string + begin, end - begin);
}
}
}
static gboolean return NULL;
wrapper_eval (const GMatchInfo *match_info,
GString *result,
gpointer user_data)
{
struct {
ImplRegexEvalCallback callback;
gpointer user_data;
} *wrapper = user_data;
ImplMatchInfo wrapped = {
.match_info = (GMatchInfo *)match_info,
};
return wrapper->callback (&wrapped, result, wrapper->user_data);
} }
char * char *
impl_regex_replace_eval (const ImplRegex *regex, impl_regex_replace_eval (const ImplRegex *regex,
const char *string, const char *string,
gssize string_len, gssize string_len,
int start_position, gsize start_position,
GRegexMatchFlags match_options, GRegexMatchFlags match_options,
ImplRegexEvalCallback eval, ImplRegexEvalCallback eval,
gpointer user_data, gpointer user_data,
GError **error) GError **error)
{ {
struct { ImplMatchInfo *match_info;
ImplRegexEvalCallback callback; GString *result;
gpointer user_data; gsize str_pos = 0;
} wrapper; gboolean done = FALSE;
GError *tmp_error = NULL;
g_return_val_if_fail (regex != NULL, NULL); g_return_val_if_fail (regex != NULL, NULL);
g_return_val_if_fail (regex->re != NULL, NULL); g_return_val_if_fail (string != NULL, NULL);
g_return_val_if_fail (eval != NULL, NULL);
wrapper.callback = eval;
wrapper.user_data = user_data; if (string_len < 0)
{
return g_regex_replace_eval (regex->re, string_len = strlen (string);
string, }
string_len,
start_position, result = g_string_sized_new (string_len);
match_options,
wrapper_eval, /* run down the string making matches. */
&wrapper, impl_regex_match_full (regex,
error); string,
string_len,
start_position,
match_options,
&match_info,
&tmp_error);
g_assert (match_info != NULL);
while (!done && impl_match_info_matches (match_info))
{
g_string_append_len (result,
string + str_pos,
match_info->offsets[0] - str_pos);
done = (*eval) (match_info, result, user_data);
str_pos = match_info->offsets[1];
impl_match_info_next (match_info, &tmp_error);
}
impl_match_info_free (match_info);
if (tmp_error != NULL)
{
g_propagate_error (error, tmp_error);
g_string_free (result, TRUE);
return NULL;
}
g_string_append_len (result, string + str_pos, string_len - str_pos);
return g_string_free (result, FALSE);
} }
gboolean gboolean
impl_regex_match_full (const ImplRegex *regex, impl_regex_match_full (const ImplRegex *regex,
const char *string, const char *string,
gssize string_len, gssize string_len,
int start_position, gsize start_position,
GRegexMatchFlags match_options, GRegexMatchFlags match_options,
ImplMatchInfo **match_info, ImplMatchInfo **match_info,
GError **error) GError **error)
{ {
GMatchInfo *wrapped = NULL; ImplMatchInfo *local_match_info = NULL;
gboolean ret; gboolean ret = FALSE;
g_return_val_if_fail (regex != NULL, FALSE); g_return_val_if_fail (regex != NULL, FALSE);
g_return_val_if_fail (regex->re != NULL, FALSE); g_return_val_if_fail (regex->code != NULL, FALSE);
g_return_val_if_fail (match_options == 0, FALSE);
g_return_val_if_fail (string != NULL, FALSE);
if (string_len < 0)
{
string_len = strlen (string);
}