Add support for \x \u and \U escape sequences.

For all escape sequences, a value of 0 is illegal and is converted to a space. \x supports only the range from 0x01 to 0x7F, because values greater than or equal to 0x80 could produce illegal UTF-8 sequences. \u specifies a Unicode code point with exactly 4 hex digits. \U specifies a Unicode code point with exactly 6 hex digits.
2014-06-22 04:13:48 +02:00 · 2014-06-22 04:13:48 +02:00 · 1a2fcc0559
parent 62aebbbb37
commit 1a2fcc0559
1 changed files with 21 additions and 0 deletions
--- a/src/lexer.l
+++ b/src/lexer.l
@ -26,6 +26,7 @@

 %{

+#include <glib.h>
 #include "typedefs.h"
 #include "handle_dep.h"
 #include "printutils.h"
@ -77,6 +78,7 @@ extern FileModule *rootmodule;
  }                                       \
 }

+void to_utf8(const char *, char *);
 void includefile();
 fs::path sourcepath();
 std::vector<fs::path> path_stack;
@ -97,6 +99,7 @@ std::string filepath;

 D [0-9]
 E [Ee][+-]?{D}+
+H [0-9a-fA-F]

 %%

@ -166,6 +169,8 @@ use[ \t\r\n>]*"<"	{ BEGIN(cond_use); }
 \\r			{ stringcontents += '\r'; }
 \\\\			{ stringcontents += '\\'; }
 \\\"			{ stringcontents += '"'; }
+\\x[0-7]{H}             { unsigned long i = strtoul(lexertext + 2, NULL, 16); stringcontents += (i == 0 ? ' ' : (unsigned char)(i & 0xff)); }
+\\u{H}{4}|\\U{H}{6}     { char buf[8]; to_utf8(lexertext + 2, buf); stringcontents += buf; }
 [^\\\n\"]+		{ stringcontents += lexertext; }
 \"			{ BEGIN(INITIAL); 
 			parserlval.text = strdup(stringcontents.c_str()); 
@ -194,6 +199,22 @@ use[ \t\r\n>]*"<"	{ BEGIN(cond_use); }

 %%

+/*!
+ * Convert a Unicode code point given as a hex string into its UTF-8
+ * encoding. out must hold at least 8 bytes; it is zero-filled first.
+ * A zero or invalid code point yields a single space instead.
+ */
+void to_utf8(const char *str, char *out)
+{
+    memset(out, 0, 8); // zero-fill so the bytes after the written character are NUL
+    const gunichar c = strtoul(str, NULL, 16); // parse the leading hex digits of str
+    if (g_unichar_validate(c) && (c != 0)) {
+        g_unichar_to_utf8(c, out); // store the UTF-8 bytes of c in out
+    } else {
+        out[0] = ' '; // zero or rejected by g_unichar_validate: substitute a space
+    }
+}
+
 fs::path sourcepath()
 {
  if (!path_stack.empty()) return path_stack.back();