blob: 5e4c4f948a4a2570649d4b83cac405539c3fc6b2 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0+
/*
 *  charset conversion utils
 *
 *  Copyright (c) 2017 Rob Clark
 */
7
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +02008#include <common.h>
Rob Clark78178bb2017-09-09 06:47:40 -04009#include <charset.h>
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020010#include <capitalization.h>
Heinrich Schuchardt70616a12021-02-27 14:08:35 +010011#include <cp437.h>
Ilias Apalodimas6974a4a2020-11-22 15:10:26 +020012#include <efi_loader.h>
Heinrich Schuchardt73bb90c2021-02-27 14:08:36 +010013#include <errno.h>
Rob Clark78178bb2017-09-09 06:47:40 -040014#include <malloc.h>
15
Heinrich Schuchardt70616a12021-02-27 14:08:35 +010016/**
17 * codepage_437 - Unicode to codepage 437 translation table
18 */
19const u16 codepage_437[128] = CP437;
20
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020021static struct capitalization_table capitalization_table[] =
22#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
23 UNICODE_CAPITALIZATION_TABLE;
24#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
25 CP1250_CAPITALIZATION_TABLE;
26#else
27 CP437_CAPITALIZATION_TABLE;
28#endif
29
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020030/**
31 * get_code() - read Unicode code point from UTF-8 stream
32 *
33 * @read_u8: - stream reader
34 * @src: - string buffer passed to stream reader, optional
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +010035 * Return: - Unicode code point, or -1
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020036 */
37static int get_code(u8 (*read_u8)(void *data), void *data)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020038{
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020039 s32 ch = 0;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020040
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020041 ch = read_u8(data);
42 if (!ch)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020043 return 0;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020044 if (ch >= 0xc2 && ch <= 0xf4) {
45 int code = 0;
46
47 if (ch >= 0xe0) {
48 if (ch >= 0xf0) {
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020049 /* 0xf0 - 0xf4 */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020050 ch &= 0x07;
51 code = ch << 18;
52 ch = read_u8(data);
53 if (ch < 0x80 || ch > 0xbf)
54 goto error;
55 ch &= 0x3f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020056 } else {
57 /* 0xe0 - 0xef */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020058 ch &= 0x0f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020059 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020060 code += ch << 12;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020061 if ((code >= 0xD800 && code <= 0xDFFF) ||
62 code >= 0x110000)
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020063 goto error;
64 ch = read_u8(data);
65 if (ch < 0x80 || ch > 0xbf)
66 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020067 }
68 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020069 ch &= 0x3f;
70 code += ch << 6;
71 ch = read_u8(data);
72 if (ch < 0x80 || ch > 0xbf)
73 goto error;
74 ch &= 0x3f;
75 ch += code;
76 } else if (ch >= 0x80) {
77 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020078 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020079 return ch;
80error:
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +010081 return -1;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020082}
83
84/**
85 * read_string() - read byte from character string
86 *
87 * @data: - pointer to string
88 * Return: - byte read
89 *
90 * The string pointer is incremented if it does not point to '\0'.
91 */
92static u8 read_string(void *data)
93
94{
95 const char **src = (const char **)data;
96 u8 c;
97
98 if (!src || !*src || !**src)
99 return 0;
100 c = **src;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +0200101 ++*src;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200102 return c;
103}
104
105/**
106 * read_console() - read byte from console
107 *
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200108 * @data - not used, needed to match interface
109 * Return: - byte read or 0 on error
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200110 */
111static u8 read_console(void *data)
112{
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200113 int ch;
114
Heinrich Schuchardtc670aee2020-10-07 18:11:48 +0200115 ch = getchar();
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200116 if (ch < 0)
117 ch = 0;
118 return ch;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200119}
120
121int console_read_unicode(s32 *code)
122{
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +0100123 for (;;) {
124 s32 c;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200125
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +0100126 if (!tstc()) {
127 /* No input available */
128 return 1;
129 }
130
131 /* Read Unicode code */
132 c = get_code(read_console, NULL);
133 if (c > 0) {
134 *code = c;
135 return 0;
136 }
137 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200138}
139
140s32 utf8_get(const char **src)
141{
142 return get_code(read_string, src);
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +0200143}
144
145int utf8_put(s32 code, char **dst)
146{
147 if (!dst || !*dst)
148 return -1;
149 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
150 return -1;
151 if (code <= 0x007F) {
152 **dst = code;
153 } else {
154 if (code <= 0x07FF) {
155 **dst = code >> 6 | 0xC0;
156 } else {
157 if (code < 0x10000) {
158 **dst = code >> 12 | 0xE0;
159 } else {
160 **dst = code >> 18 | 0xF0;
161 ++*dst;
162 **dst = (code >> 12 & 0x3F) | 0x80;
163 }
164 ++*dst;
165 **dst = (code >> 6 & 0x3F) | 0x80;
166 }
167 ++*dst;
168 **dst = (code & 0x3F) | 0x80;
169 }
170 ++*dst;
171 return 0;
172}
173
174size_t utf8_utf16_strnlen(const char *src, size_t count)
175{
176 size_t len = 0;
177
178 for (; *src && count; --count) {
179 s32 code = utf8_get(&src);
180
181 if (!code)
182 break;
183 if (code < 0) {
184 /* Reserve space for a replacement character */
185 len += 1;
186 } else if (code < 0x10000) {
187 len += 1;
188 } else {
189 len += 2;
190 }
191 }
192 return len;
193}
194
195int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
196{
197 if (!src || !dst || !*dst)
198 return -1;
199
200 for (; count && *src; --count) {
201 s32 code = utf8_get(&src);
202
203 if (code < 0)
204 code = '?';
205 utf16_put(code, dst);
206 }
207 **dst = 0;
208 return 0;
209}
210
211s32 utf16_get(const u16 **src)
212{
213 s32 code, code2;
214
215 if (!src || !*src)
216 return -1;
217 if (!**src)
218 return 0;
219 code = **src;
220 ++*src;
221 if (code >= 0xDC00 && code <= 0xDFFF)
222 return -1;
223 if (code >= 0xD800 && code <= 0xDBFF) {
224 if (!**src)
225 return -1;
226 code &= 0x3ff;
227 code <<= 10;
228 code += 0x10000;
229 code2 = **src;
230 ++*src;
231 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
232 return -1;
233 code2 &= 0x3ff;
234 code += code2;
235 }
236 return code;
237}
238
239int utf16_put(s32 code, u16 **dst)
240{
241 if (!dst || !*dst)
242 return -1;
243 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
244 return -1;
245 if (code < 0x10000) {
246 **dst = code;
247 } else {
248 code -= 0x10000;
249 **dst = code >> 10 | 0xD800;
250 ++*dst;
251 **dst = (code & 0x3ff) | 0xDC00;
252 }
253 ++*dst;
254 return 0;
255}
256
257size_t utf16_strnlen(const u16 *src, size_t count)
258{
259 size_t len = 0;
260
261 for (; *src && count; --count) {
262 s32 code = utf16_get(&src);
263
264 if (!code)
265 break;
266 /*
267 * In case of an illegal sequence still reserve space for a
268 * replacement character.
269 */
270 ++len;
271 }
272 return len;
273}
274
275size_t utf16_utf8_strnlen(const u16 *src, size_t count)
276{
277 size_t len = 0;
278
279 for (; *src && count; --count) {
280 s32 code = utf16_get(&src);
281
282 if (!code)
283 break;
284 if (code < 0)
285 /* Reserve space for a replacement character */
286 len += 1;
287 else if (code < 0x80)
288 len += 1;
289 else if (code < 0x800)
290 len += 2;
291 else if (code < 0x10000)
292 len += 3;
293 else
294 len += 4;
295 }
296 return len;
297}
298
299int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
300{
301 if (!src || !dst || !*dst)
302 return -1;
303
304 for (; count && *src; --count) {
305 s32 code = utf16_get(&src);
306
307 if (code < 0)
308 code = '?';
309 utf8_put(code, dst);
310 }
311 **dst = 0;
312 return 0;
313}
314
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +0200315s32 utf_to_lower(const s32 code)
316{
317 struct capitalization_table *pos = capitalization_table;
318 s32 ret = code;
319
320 if (code <= 0x7f) {
321 if (code >= 'A' && code <= 'Z')
322 ret += 0x20;
323 return ret;
324 }
325 for (; pos->upper; ++pos) {
326 if (pos->upper == code) {
327 ret = pos->lower;
328 break;
329 }
330 }
331 return ret;
332}
333
334s32 utf_to_upper(const s32 code)
335{
336 struct capitalization_table *pos = capitalization_table;
337 s32 ret = code;
338
339 if (code <= 0x7f) {
340 if (code >= 'a' && code <= 'z')
341 ret -= 0x20;
342 return ret;
343 }
344 for (; pos->lower; ++pos) {
345 if (pos->lower == code) {
346 ret = pos->upper;
347 break;
348 }
349 }
350 return ret;
351}
Rob Clark78178bb2017-09-09 06:47:40 -0400352
AKASHI Takahirof8062c92019-09-18 10:26:29 +0900353/*
Heinrich Schuchardt7a9b3662022-12-29 14:44:03 +0100354 * u16_strcasecmp() - compare two u16 strings case insensitively
355 *
356 * @s1: first string to compare
357 * @s2: second string to compare
358 * @n: maximum number of u16 to compare
359 * Return: 0 if the first n u16 are the same in s1 and s2
360 * < 0 if the first different u16 in s1 is less than the
361 * corresponding u16 in s2
362 * > 0 if the first different u16 in s1 is greater than the
363 */
364int u16_strcasecmp(const u16 *s1, const u16 *s2)
365{
366 int ret = 0;
367 s32 c1, c2;
368
369 for (;;) {
370 c1 = utf_to_upper(utf16_get(&s1));
371 c2 = utf_to_upper(utf16_get(&s2));
372 ret = c1 - c2;
373 if (ret || !c1 || c1 == -1 || c2 == -1)
374 break;
375 }
376 return ret;
377}
378
379/*
AKASHI Takahirof8062c92019-09-18 10:26:29 +0900380 * u16_strncmp() - compare two u16 string
381 *
382 * @s1: first string to compare
383 * @s2: second string to compare
384 * @n: maximum number of u16 to compare
385 * Return: 0 if the first n u16 are the same in s1 and s2
386 * < 0 if the first different u16 in s1 is less than the
387 * corresponding u16 in s2
388 * > 0 if the first different u16 in s1 is greater than the
389 * corresponding u16 in s2
390 */
391int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
392{
393 int ret = 0;
394
395 for (; n; --n, ++s1, ++s2) {
396 ret = *s1 - *s2;
397 if (ret || !*s1)
398 break;
399 }
400
401 return ret;
402}
403
Ilias Apalodimas6974a4a2020-11-22 15:10:26 +0200404size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
Rob Clark78178bb2017-09-09 06:47:40 -0400405{
406 size_t i;
407 for (i = 0; count-- && in[i]; i++);
408 return i;
409}
410
Sughosh Ganu4835d352020-05-06 22:12:41 +0300411size_t u16_strsize(const void *in)
412{
413 return (u16_strlen(in) + 1) * sizeof(u16);
414}
415
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900416u16 *u16_strcpy(u16 *dest, const u16 *src)
417{
418 u16 *tmp = dest;
419
420 for (;; dest++, src++) {
421 *dest = *src;
422 if (!*src)
423 break;
424 }
425
426 return tmp;
427}
428
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200429u16 *u16_strdup(const void *src)
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900430{
431 u16 *new;
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200432 size_t len;
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900433
434 if (!src)
435 return NULL;
Heinrich Schuchardt967407d2022-04-02 11:46:59 +0200436 len = u16_strsize(src);
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200437 new = malloc(len);
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900438 if (!new)
439 return NULL;
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200440 memcpy(new, src, len);
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900441
442 return new;
443}
444
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900445size_t u16_strlcat(u16 *dest, const u16 *src, size_t count)
446{
Matthias Schiffer7c00b802023-07-14 13:24:51 +0200447 size_t destlen = u16_strnlen(dest, count);
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900448 size_t srclen = u16_strlen(src);
Matthias Schiffer7c00b802023-07-14 13:24:51 +0200449 size_t ret = destlen + srclen;
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900450
451 if (destlen >= count)
452 return ret;
Matthias Schiffer7c00b802023-07-14 13:24:51 +0200453 if (ret >= count)
454 srclen -= (ret - count + 1);
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900455 memcpy(&dest[destlen], src, 2 * srclen);
456 dest[destlen + srclen] = 0x0000;
457
458 return ret;
459}
460
/*
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * Exactly @size units of @src are examined; a 0 unit does not stop the
 * conversion. Every unpaired surrogate is replaced by '?'. No terminator is
 * written. Each input unit yields at most three output bytes (a surrogate
 * pair yields four bytes for two units); the size of @dest is not checked.
 *
 * @dest:	output buffer
 * @src:	UTF-16 input
 * @size:	number of u16 units to convert
 * Return:	pointer just past the last byte written to @dest
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;	/* pending high surrogate, 0 if none */

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Unpaired high surrogate */
				*dest++ = '?';
				/*
				 * The current unit may be valid, so examine
				 * it again. The count has to be restored
				 * together with the pointer, otherwise the
				 * last unit of the input is never converted.
				 */
				src--;
				size++;
			}

			code_high = 0;
		} else {
			if (code <= 0x007F) {
				*dest++ = code;
			} else if (code <= 0x07FF) {
				*dest++ = (code >> 6) | 0xC0;
				*dest++ = (code & 0x3F) | 0x80;
			} else if (code >= 0xD800 && code <= 0xDBFF) {
				/* Wait for the matching low surrogate */
				code_high = code;
				continue;
			} else if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Low surrogate without preceding high one */
				*dest++ = '?';
			} else {
				/* Remaining 16 bit values need three bytes */
				*dest++ = (code >> 12) | 0xE0;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			}
		}
	}

	/* Input ended in the middle of a surrogate pair */
	if (code_high)
		*dest++ = '?';

	return dest;
}
Heinrich Schuchardt73bb90c2021-02-27 14:08:36 +0100513
Heinrich Schuchardt73bb90c2021-02-27 14:08:36 +0100514int utf_to_cp(s32 *c, const u16 *codepage)
515{
516 if (*c >= 0x80) {
517 int j;
518
519 /* Look up codepage translation */
520 for (j = 0; j < 0x80; ++j) {
521 if (*c == codepage[j]) {
522 *c = j + 0x80;
523 return 0;
524 }
525 }
526 *c = '?';
527 return -ENOENT;
528 }
529 return 0;
530}
Heinrich Schuchardte91789e2021-02-27 14:08:38 +0100531
532int utf8_to_cp437_stream(u8 c, char *buffer)
533{
534 char *end;
535 const char *pos;
536 s32 s;
537 int ret;
538
539 for (;;) {
540 pos = buffer;
541 end = buffer + strlen(buffer);
542 *end++ = c;
543 *end = 0;
544 s = utf8_get(&pos);
545 if (s > 0) {
546 *buffer = 0;
547 ret = utf_to_cp(&s, codepage_437);
548 return s;
549 }
550 if (pos == end)
551 return 0;
552 *buffer = 0;
553 }
554}
555
556int utf8_to_utf32_stream(u8 c, char *buffer)
557{
558 char *end;
559 const char *pos;
560 s32 s;
561
562 for (;;) {
563 pos = buffer;
564 end = buffer + strlen(buffer);
565 *end++ = c;
566 *end = 0;
567 s = utf8_get(&pos);
568 if (s > 0) {
569 *buffer = 0;
570 return s;
571 }
572 if (pos == end)
573 return 0;
574 *buffer = 0;
575 }
576}