From 06735e32566ca2250afdc371b8b2521ee07ad922 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 29 Oct 2008 08:04:54 +0000 Subject: Unicode escapes in strings and identifiers --- doc/src/sgml/syntax.sgml | 142 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 134 insertions(+), 8 deletions(-) (limited to 'doc/src') diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index 0efba278c55..6c988011b7c 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -1,4 +1,4 @@ - + SQL Syntax @@ -189,6 +189,57 @@ UPDATE "my_table" SET "a" = 5; ampersands. The length limitation still applies. + + Unicode escape in + identifiers A variant of quoted + identifiers allows including escaped Unicode characters identified + by their code points. This variant starts + with U& (upper or lower case U followed by + ampersand) immediately before the opening double quote, without + any spaces in between, for example U&"foo". + (Note that this creates an ambiguity with the + operator &. Use spaces around the operator to + avoid this problem.) Inside the quotes, Unicode characters can be + specified in escaped form by writing a backslash followed by the + four-digit hexadecimal code point number or alternatively a + backslash followed by a plus sign followed by a six-digit + hexadecimal code point number. For example, the + identifier "data" could be written as + +U&"d\0061t\+000061" + + The following less trivial example writes the Russian + word slon (elephant) in Cyrillic letters: + +U&"\0441\043B\043E\043D" + + + + + If a different escape character than backslash is desired, it can + be specified using + the UESCAPE + clause after the string, for example: + +U&"d!0061t!+000061" UESCAPE '!' + + The escape character can be any single character other than a + hexadecimal digit, the plus sign, a single quote, a double quote, + or a whitespace character. Note that the escape character is + written in single quotes, not double quotes. 
+ + + To include the escape character in the identifier literally, write + it twice. + + + + The Unicode escape syntax works only when the server encoding is + UTF8. When other server encodings are used, only code points in + the ASCII range (up to \007F) can be specified. + + Quoting an identifier also makes it case-sensitive, whereas unquoted names are always folded to lower case. For example, the @@ -245,7 +296,7 @@ UPDATE "my_table" SET "a" = 5; write two adjacent single quotes, e.g. 'Dianne''s horse'. Note that this is not the same as a double-quote - character ("). + character ("). @@ -269,14 +320,19 @@ SELECT 'foo' 'bar'; by SQL; PostgreSQL is following the standard.) + - - + + String Constants with C-Style Escapes + + escape string syntax - + backslash escapes + + PostgreSQL also accepts escape string constants, which are an extension to the SQL standard. An escape string constant is specified by writing the letter @@ -287,7 +343,8 @@ SELECT 'foo' 'bar'; Within an escape string, a backslash character (\) begins a C-like backslash escape sequence, in which the combination of backslash and following character(s) represent a special byte - value: + value, as shown in the table below. + Backslash Escape Sequences @@ -341,14 +398,24 @@ SELECT 'foo' 'bar';
- It is your responsibility that the byte sequences you create are - valid characters in the server character set encoding. Any other + + Any other character following a backslash is taken literally. Thus, to include a backslash character, write two backslashes (\\). Also, a single quote can be included in an escape string by writing \', in addition to the normal way of ''. + + It is your responsibility that the byte sequences you create are + valid characters in the server character set encoding. When the + server encoding is UTF-8, then the alternative Unicode escape + syntax, explained in the section String Constants with Unicode Escapes, + should be used instead. (The alternative would be doing the + UTF-8 encoding by hand and writing out the bytes, which would be + very cumbersome.) + + If the configuration parameter @@ -379,6 +446,65 @@ SELECT 'foo' 'bar';
+ + String Constants with Unicode Escapes + + + Unicode escape + in string constants + + + + PostgreSQL also supports another type + of escape syntax for strings that allows specifying arbitrary + Unicode characters by code point. A Unicode escape string + constant starts with U& (upper or lower case + letter U followed by ampersand) immediately before the opening + quote, without any spaces in between, for + example U&'foo'. (Note that this creates an + ambiguity with the operator &. Use spaces + around the operator to avoid this problem.) Inside the quotes, + Unicode characters can be specified in escaped form by writing a + backslash followed by the four-digit hexadecimal code point + number or alternatively a backslash followed by a plus sign + followed by a six-digit hexadecimal code point number. For + example, the string 'data' could be written as + +U&'d\0061t\+000061' + + The following less trivial example writes the Russian + word slon (elephant) in Cyrillic letters: + +U&'\0441\043B\043E\043D' + + + + + If a different escape character than backslash is desired, it can + be specified using + the UESCAPE + clause after the string, for example: + + U&'d!0061t!+000061' UESCAPE '!' + + The escape character can be any single character other than a + hexadecimal digit, the plus sign, a single quote, a double quote, + or a whitespace character. + + + + The Unicode escape syntax works only when the server encoding is + UTF8. When other server encodings are used, only code points in + the ASCII range (up to \007F) can be + specified. + + + + To include the escape character in the string literally, write it + twice. + + + Dollar-Quoted String Constants -- cgit v1.2.3