A proper `send_string()` for the Unicode feature (#8155)

author: Ryan <fauxpark@gmail.com> 2020-02-24 10:27:25 +1100
committer: GitHub <noreply@github.com> 2020-02-24 10:27:25 +1100
commit: 371ff9dd6f9c4feada34622d9a43480495b07e50 (patch)
tree: 785ba3038f5e431f1192c09f11d022c2c6850555
parent: 716c29881c0f91e3998e1fc5c49740bd6e65876f (diff)
3 files changed, 67 insertions, 5 deletions
diff --git a/docs/feature_unicode.md b/docs/feature_unicode.md
index bd1f4fa5ae..546af2521a 100644
--- a/docs/feature_unicode.md
+++ b/docs/feature_unicode.md
@@ -193,12 +193,24 @@ By default, when the keyboard boots, it will initialize the input mode to the la
 
 !> Using `UNICODE_SELECTED_MODES` means you don't have to initially set the input mode in `matrix_init_user()` (or a similar function); the Unicode system will do that for you on startup. This has the added benefit of avoiding unnecessary writes to EEPROM.
 
-## `send_unicode_hex_string`
+## `send_unicode_string()`
 
-To type multiple characters for things like (ノಠ痊ಠ)ノ彡┻━┻, you can use `send_unicode_hex_string()` much like `SEND_STRING()` except you would use hex values separate by spaces.
-For example, the table flip seen above would be `send_unicode_hex_string("0028 30CE 0CA0 75CA 0CA0 0029 30CE 5F61 253B 2501 253B")`
+This function is much like `send_string()`, but allows you to input UTF-8 characters directly, currently up to code point U+FFFF. Make sure your `keymap.c` is saved in UTF-8 encoding.
 
-There are many ways to get a hex code, but an easy one is [this site](https://r12a.github.io/app-conversion/). Just make sure to convert to hexadecimal, and that is your string.
+```c
+send_unicode_string("(ノಠ痊ಠ)ノ彡┻━┻");
+```
+
+## `send_unicode_hex_string()`
+
+Similar to `send_unicode_string()`, but the characters are represented by their hexadecimal code point values, written in ASCII and separated by spaces. For example, the table flip above would be achieved with:
+
+```c
+send_unicode_hex_string("0028 30CE 0CA0 75CA 0CA0 0029 30CE 5F61 253B 2501 253B");
+```
+
+An easy way to convert your Unicode string to this format is by using [this site](https://r12a.github.io/app-conversion/), and taking the result in the "Hex/UTF-32" section.
+Unlike `send_unicode_string()`, this function supports code points up to U+10FFFF.
 
 ## Additional Language Support
 
@@ -228,6 +240,6 @@ AutoHotkey inserts the Text right of `Send, ` when this combination is pressed.
 
 If you enable the US International layout on the system, it will use punctuation to accent the characters.
 
-For instance, typing "`a" will result in à.
+For instance, typing "\`a" will result in à.
 
 You can find details on how to enable this [here](https://support.microsoft.com/en-us/help/17424/windows-change-keyboard-layout).
diff --git a/quantum/process_keycode/process_unicode_common.c b/quantum/process_keycode/process_unicode_common.c
index 94383f19b7..4ac305e661 100644
--- a/quantum/process_keycode/process_unicode_common.c
+++ b/quantum/process_keycode/process_unicode_common.c
@@ -178,6 +178,55 @@ void send_unicode_hex_string(const char *str) {
     }
 }
 
+// Borrowed from https://nullprogram.com/blog/2017/10/06/
+const char *decode_utf8(const char *str, int32_t *code_point) {
+    const char *next;
+
+    if (str[0] < 0x80) {  // U+0000-007F
+        *code_point = str[0];
+        next        = str + 1;
+    } else if ((str[0] & 0xE0) == 0xC0) {  // U+0080-07FF
+        *code_point = ((int32_t)(str[0] & 0x1F) << 6) | ((int32_t)(str[1] & 0x3F) << 0);
+        next        = str + 2;
+    } else if ((str[0] & 0xF0) == 0xE0) {  // U+0800-FFFF
+        *code_point = ((int32_t)(str[0] & 0x0F) << 12) | ((int32_t)(str[1] & 0x3F) << 6) | ((int32_t)(str[2] & 0x3F) << 0);
+        next        = str + 3;
+    } else if ((str[0] & 0xF8) == 0xF0 && (str[0] <= 0xF4)) {  // U+10000-10FFFF
+        // Skip for now - register_hex() only takes a uint16
+        //*code_point = ((int32_t)(str[0] & 0x07) << 18) | ((int32_t)(str[1] & 0x3F) << 12) | ((int32_t)(str[2] & 0x3F) << 6) | ((int32_t)(str[3] & 0x3F) << 0);
+        *code_point = -1;
+        next        = str + 4;
+    } else {
+        *code_point = -1;
+        next        = str + 1;
+    }
+
+    // part of a UTF-16 surrogate pair - invalid
+    if (*code_point >= 0xD800 && *code_point <= 0xDFFF) {
+        *code_point = -1;
+    }
+
+    return next;
+}
+
+void send_unicode_string(const char *str) {
+    if (!str) {
+        return;
+    }
+
+    int32_t code_point = 0;
+
+    while (*str) {
+        str = decode_utf8(str, &code_point);
+
+        if (code_point >= 0) {
+            unicode_input_start();
+            register_hex(code_point);
+            unicode_input_finish();
+        }
+    }
+}
+
 bool process_unicode_common(uint16_t keycode, keyrecord_t *record) {
     if (record->event.pressed) {
         switch (keycode) {
diff --git a/quantum/process_keycode/process_unicode_common.h b/quantum/process_keycode/process_unicode_common.h
index cab6eea6ea..393db2d99e 100644
--- a/quantum/process_keycode/process_unicode_common.h
+++ b/quantum/process_keycode/process_unicode_common.h
@@ -80,6 +80,7 @@ void unicode_input_cancel(void);
 
 void register_hex(uint16_t hex);
 void send_unicode_hex_string(const char *str);
+void send_unicode_string(const char *str);
 
 bool process_unicode_common(uint16_t keycode, keyrecord_t *record);
author	Ryan <fauxpark@gmail.com>	2020-02-24 10:27:25 +1100
committer	GitHub <noreply@github.com>	2020-02-24 10:27:25 +1100
commit	371ff9dd6f9c4feada34622d9a43480495b07e50 (patch)
tree	785ba3038f5e431f1192c09f11d022c2c6850555
parent	716c29881c0f91e3998e1fc5c49740bd6e65876f (diff)