Commit 3abe31a0 authored by Jeffery To

def.lang: Add Unicode identifier category regular expressions

This adds regular expressions for the Unicode ID_Start and ID_Continue
categories to def.lang, so that they may be reused in different language
definitions.

(These regular expressions are necessary because GRegex does not yet
support referencing these categories directly, e.g. \p{ID_Start}.)

This also updates python3.lang to use these new regular expressions.
parent 7548a495
......@@ -124,6 +124,17 @@
<!-- $^ never matches. Defined under the id "never-match" so the pattern can be referenced by name. -->
<define-regex id="never-match">$^</define-regex>
<!-- Unicode ID_Start and ID_Continue characters
Descriptions from https://www.unicode.org/reports/tr31/#Table_Lexical_Classes_for_Identifiers
Other_ID_Start and Other_ID_Continue code points from https://www.unicode.org/Public/12.1.0/ucd/PropList.txt

unicode-id-start matches one character that is a letter (\p{L}), a letter
number (\p{Nl}), or one of the explicitly listed Other_ID_Start code points:
U+1885 to U+1886, U+2118, U+212E, U+309B to U+309C.
Underscore is not part of this class; a language that allows a leading
underscore has to add it separately.
-->
<define-regex id="unicode-id-start" extended="true">
[\p{L}\p{Nl}\x{1885}-\x{1886}\x{2118}\x{212E}\x{309B}-\x{309C}]
</define-regex>
<!-- unicode-id-continue matches one character from the same set as
unicode-id-start (\p{L}, \p{Nl}, U+1885 to U+1886, U+2118, U+212E, U+309B to
U+309C), or a nonspacing mark (\p{Mn}), spacing mark (\p{Mc}), decimal digit
(\p{Nd}), connector punctuation (\p{Pc}), or one of the explicitly listed
Other_ID_Continue code points: U+00B7, U+0387, U+1369 to U+1371, U+19DA.
-->
<define-regex id="unicode-id-continue" extended="true">
[\p{L}\p{Nl}\x{1885}-\x{1886}\x{2118}\x{212E}\x{309B}-\x{309C}\p{Mn}\p{Mc}\p{Nd}\p{Pc}\x{00B7}\x{0387}\x{1369}-\x{1371}\x{19DA}]
</define-regex>
<!-- Decimal integer literal: either a nonzero digit followed by any number of
digits, or a single 0. The lookbehind and lookahead reject a match that is
directly preceded or followed by a word character or a dot.
-->
<define-regex id="decimal" extended="true">
(?&lt;![\w\.]) ([1-9][0-9]* | 0) (?![\w\.])
</define-regex>
......
......@@ -50,22 +50,11 @@
</styles>
<definitions>
<!-- https://docs.python.org/3/reference/lexical_analysis.html#identifiers
id-start matches one character that may begin an identifier: a letter in one
of the categories Lu, Ll, Lt, Lm, Lo, a letter number (Nl), an underscore, or
one of the Other_ID_Start code points listed in the second alternative.
-->
<define-regex id="id-start" extended="true">
(?:
[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_] |
[\x{1885}-\x{1886}\x{2118}\x{212E}\x{309B}-\x{309C}] # Other_ID_Start (Unicode 12.1.0)
)
</define-regex>
<!-- id-continue matches one character that may follow the first character of
an identifier: anything id-start matches, a character in one of the categories
Mn, Mc, Nd, Pc, or one of the listed Other_ID_Continue code points.
-->
<define-regex id="id-continue" extended="true">
(?:
\%{id-start} |
[\p{Mn}\p{Mc}\p{Nd}\p{Pc}] |
[\x{00B7}\x{0387}\x{1369}-\x{1371}\x{19DA}] # Other_ID_Continue (Unicode 12.1.0)
)
</define-regex>
<!-- https://docs.python.org/3/reference/lexical_analysis.html#identifiers
Underscore ('LOW LINE' U+005F) is a member of \p{Pc}, and so included in def:unicode-id-continue.
It is not matched by def:unicode-id-start, so it is listed explicitly as an allowed first character.
The pattern is one atomic group: a start character followed by any number of continue characters.
It must be the only pattern in this element. A second identifier pattern placed after it could
never match, because the atomic group has already consumed every continue character.
-->
<define-regex id="identifier" extended="true">
(?&gt; (?: _ | \%{def:unicode-id-start} ) \%{def:unicode-id-continue}* )
</define-regex>
<!-- A nonzero digit followed by any number of digits. -->
<define-regex id="number">[1-9][0-9]*</define-regex>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment