// SPDX-License-Identifier: GPL-2.0+
/*
 * charset conversion utils
 *
 * Copyright (c) 2017 Rob Clark
 */
7
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +02008#include <common.h>
Rob Clark78178bb2017-09-09 06:47:40 -04009#include <charset.h>
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020010#include <capitalization.h>
Rob Clark78178bb2017-09-09 06:47:40 -040011#include <malloc.h>
12
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020013static struct capitalization_table capitalization_table[] =
14#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
15 UNICODE_CAPITALIZATION_TABLE;
16#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
17 CP1250_CAPITALIZATION_TABLE;
18#else
19 CP437_CAPITALIZATION_TABLE;
20#endif
21
/**
 * get_code() - read Unicode code point from UTF-8 stream
 *
 * The byte sequence is validated while it is read. Overlong encodings,
 * UTF-8 encoded surrogates (U+D800 - U+DFFF) and values above U+10FFFF are
 * rejected as soon as the offending byte has been read. All bytes read up
 * to and including the offending byte are consumed.
 *
 * @read_u8:	stream reader
 * @data:	argument passed to the stream reader, optional
 * Return:	Unicode code point, 0 if the first byte read is 0,
 *		'?' for an illegal byte sequence
 */
static int get_code(u8 (*read_u8)(void *data), void *data)
{
	s32 ch;
	s32 code;
	s32 min;
	s32 lowest;
	int remaining;

	ch = read_u8(data);
	if (ch < 0x80)
		return ch;	/* ASCII, including the terminating 0 */
	if (ch < 0xc2 || ch > 0xf4)
		goto error;	/* stray continuation or invalid lead byte */

	/*
	 * The lead byte defines the number of continuation bytes and the
	 * smallest code point that may be encoded with that length.
	 */
	if (ch < 0xe0) {
		remaining = 1;
		code = ch & 0x1f;
		min = 0x80;
	} else if (ch < 0xf0) {
		remaining = 2;
		code = ch & 0x0f;
		min = 0x800;
	} else {
		remaining = 3;
		code = ch & 0x07;
		min = 0x10000;
	}

	while (remaining--) {
		ch = read_u8(data);
		if (ch < 0x80 || ch > 0xbf)
			goto error;
		code = code << 6 | (ch & 0x3f);
		/*
		 * Smallest code point still reachable with the bytes read so
		 * far. All range limits are aligned to the bits not yet
		 * read. So checking the lower bound of the reachable range
		 * is sufficient to reject a sequence early.
		 */
		lowest = code << (6 * remaining);
		if (lowest < min ||
		    (lowest >= 0xD800 && lowest <= 0xDFFF) ||
		    lowest >= 0x110000)
			goto error;
	}
	return code;
error:
	return '?';
}

/**
 * read_string() - read byte from character string
 *
 * @data:	pointer to the string pointer (const char **)
 * Return:	byte read, 0 if @data or the string pointer is NULL or
 *		the end of the string is reached
 *
 * The string pointer is incremented if it does not point to '\0'.
 */
static u8 read_string(void *data)
{
	const char **src = (const char **)data;
	u8 c;

	if (!src || !*src || !**src)
		return 0;
	c = **src;
	++*src;
	return c;
}
96
/**
 * read_console() - read byte from console
 *
 * @data:	not used, needed to match interface
 * Return:	byte read or 0 on error
 */
static u8 read_console(void *data)
{
	int key = getc();

	/* a negative value signals an error and is reported as 0 */
	return key < 0 ? 0 : key;
}
112
113int console_read_unicode(s32 *code)
114{
115 if (!tstc()) {
116 /* No input available */
117 return 1;
118 }
119
120 /* Read Unicode code */
121 *code = get_code(read_console, NULL);
122 return 0;
123}
124
125s32 utf8_get(const char **src)
126{
127 return get_code(read_string, src);
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +0200128}
129
/**
 * utf8_put() - write a Unicode code point as UTF-8
 *
 * No terminating '\0' is written.
 *
 * @code:	Unicode code point
 * @dst:	pointer to the output pointer; on success it is advanced past
 *		the 1 to 4 bytes written, on failure it is left untouched
 * Return:	0 on success, -1 if @dst or *@dst is NULL or @code is
 *		negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values are no code points and must not be truncated */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
	    code >= 0x110000)
		return -1;

	out = *dst;
	if (code <= 0x007F) {
		*out++ = code;
	} else if (code <= 0x07FF) {
		*out++ = code >> 6 | 0xC0;
		*out++ = (code & 0x3F) | 0x80;
	} else if (code < 0x10000) {
		*out++ = code >> 12 | 0xE0;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	} else {
		*out++ = code >> 18 | 0xF0;
		*out++ = (code >> 12 & 0x3F) | 0x80;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	}
	*dst = out;
	return 0;
}
158
/**
 * utf8_utf16_strnlen() - number of UTF-16 units needed for a UTF-8 string
 *
 * @src:	UTF-8 string
 * @count:	maximum number of code points to evaluate
 * Return:	number of u16 units, not counting a terminator
 */
size_t utf8_utf16_strnlen(const char *src, size_t count)
{
	size_t units = 0;

	while (*src && count) {
		s32 cp = utf8_get(&src);

		if (!cp)
			break;
		/*
		 * Code points from 0x10000 on need a surrogate pair. All
		 * other values, including a negative value for which a
		 * replacement character is reserved, need one unit.
		 */
		units += (cp >= 0x10000) ? 2 : 1;
		--count;
	}
	return units;
}
179
/**
 * utf8_utf16_strncpy() - convert a UTF-8 string to UTF-16
 *
 * A terminating 0 is written after the converted units. *@dst is left
 * pointing at that terminator.
 *
 * @dst:	pointer to the output pointer
 * @src:	UTF-8 string
 * @count:	maximum number of code points to convert
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 */
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf8_get(&src);

		/* a negative value is replaced by a question mark */
		utf16_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
195
/**
 * utf16_get() - get next Unicode code point from a UTF-16 string
 *
 * @src:	pointer to the string pointer; it is advanced past every unit
 *		read, including the units of an illegal sequence, but not
 *		past a terminating 0
 * Return:	code point, 0 at the end of the string, -1 if @src or *@src
 *		is NULL or the units read do not form a legal sequence
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not come first */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		/* A high surrogate must be followed by a low surrogate */
		if (!**src)
			return -1;
		code2 = **src;
		++*src;
		/* 0xDC00 and 0xDFFF themselves are valid low surrogates */
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code = 0x10000 + ((code & 0x3ff) << 10) + (code2 & 0x3ff);
	}
	return code;
}
223
/**
 * utf16_put() - write a Unicode code point as UTF-16
 *
 * No terminating 0 is written.
 *
 * @code:	Unicode code point
 * @dst:	pointer to the output pointer; on success it is advanced past
 *		the one or two units written, on failure it is left untouched
 * Return:	0 on success, -1 if @dst or *@dst is NULL or @code is
 *		negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values are no code points and must not be truncated */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
	    code >= 0x110000)
		return -1;

	out = *dst;
	if (code < 0x10000) {
		*out++ = code;
	} else {
		/* Split the 20 bit offset into a surrogate pair */
		code -= 0x10000;
		*out++ = code >> 10 | 0xD800;
		*out++ = (code & 0x3ff) | 0xDC00;
	}
	*dst = out;
	return 0;
}
241
/**
 * utf16_strnlen() - count code points in a UTF-16 string
 *
 * @src:	UTF-16 string
 * @count:	maximum number of code points to count
 * Return:	number of code points; an illegal sequence counts as one
 */
size_t utf16_strnlen(const u16 *src, size_t count)
{
	size_t chars = 0;

	while (*src && count) {
		if (!utf16_get(&src))
			break;
		/*
		 * An illegal sequence is counted too, so that space is
		 * reserved for a replacement character.
		 */
		++chars;
		--count;
	}
	return chars;
}
259
/**
 * utf16_utf8_strnlen() - number of UTF-8 bytes needed for a UTF-16 string
 *
 * @src:	UTF-16 string
 * @count:	maximum number of code points to evaluate
 * Return:	number of bytes, not counting a terminator
 */
size_t utf16_utf8_strnlen(const u16 *src, size_t count)
{
	size_t bytes = 0;

	while (*src && count) {
		s32 cp = utf16_get(&src);

		if (!cp)
			break;
		/*
		 * A negative value marks an illegal sequence. One byte is
		 * reserved for its replacement character, the same as for
		 * any value below 0x80.
		 */
		if (cp < 0x80)
			bytes += 1;
		else if (cp < 0x800)
			bytes += 2;
		else if (cp < 0x10000)
			bytes += 3;
		else
			bytes += 4;
		--count;
	}
	return bytes;
}
283
/**
 * utf16_utf8_strncpy() - convert a UTF-16 string to UTF-8
 *
 * A terminating '\0' is written after the converted bytes. *@dst is left
 * pointing at that terminator.
 *
 * @dst:	pointer to the output pointer
 * @src:	UTF-16 string
 * @count:	maximum number of code points to convert
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 */
int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf16_get(&src);

		/* an illegal sequence is replaced by a question mark */
		utf8_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
299
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +0200300s32 utf_to_lower(const s32 code)
301{
302 struct capitalization_table *pos = capitalization_table;
303 s32 ret = code;
304
305 if (code <= 0x7f) {
306 if (code >= 'A' && code <= 'Z')
307 ret += 0x20;
308 return ret;
309 }
310 for (; pos->upper; ++pos) {
311 if (pos->upper == code) {
312 ret = pos->lower;
313 break;
314 }
315 }
316 return ret;
317}
318
319s32 utf_to_upper(const s32 code)
320{
321 struct capitalization_table *pos = capitalization_table;
322 s32 ret = code;
323
324 if (code <= 0x7f) {
325 if (code >= 'a' && code <= 'z')
326 ret -= 0x20;
327 return ret;
328 }
329 for (; pos->lower; ++pos) {
330 if (pos->lower == code) {
331 ret = pos->upper;
332 break;
333 }
334 }
335 return ret;
336}
Rob Clark78178bb2017-09-09 06:47:40 -0400337
/**
 * u16_strlen() - count the 16 bit units of a string
 *
 * The buffer is read bytewise through a char pointer, two bytes per step,
 * until a pair of zero bytes is found.
 *
 * @in:		string terminated by a 16 bit zero
 * Return:	number of 16 bit units before the terminator
 */
size_t u16_strlen(const void *in)
{
	const char *start = in;
	const char *cur = start;

	while (cur[0] || cur[1])
		cur += 2;

	/* byte distance halved gives the number of 16 bit units */
	return (size_t)(cur - start) >> 1;
}
349
/**
 * u16_strnlen() - count the 16 bit units of a string with a limit
 *
 * @in:		string terminated by a 16 bit zero
 * @count:	maximum number of units to inspect
 * Return:	number of units before the terminator, at most @count
 */
size_t u16_strnlen(const u16 *in, size_t count)
{
	size_t len = 0;

	/* the limit is tested first so in[count] is never read */
	while (len < count && in[len])
		++len;
	return len;
}
356
/**
 * u16_strcpy() - copy a string of 16 bit units
 *
 * Units are copied up to and including the terminating zero.
 *
 * @dest:	destination buffer
 * @src:	source string terminated by a 16 bit zero
 * Return:	@dest
 */
u16 *u16_strcpy(u16 *dest, const u16 *src)
{
	u16 *out = dest;

	/* the terminator is copied before the loop ends */
	while ((*out++ = *src++) != 0)
		;

	return dest;
}
369
/**
 * u16_strdup() - duplicate a string of 16 bit units
 *
 * The copy, including the terminating zero, is placed in a buffer obtained
 * from malloc().
 *
 * @src:	source string terminated by a 16 bit zero
 * Return:	pointer to the copy, NULL if @src is NULL or malloc() fails
 */
u16 *u16_strdup(const void *src)
{
	size_t bytes;
	u16 *copy;

	if (!src)
		return NULL;

	/* size in bytes including the terminator */
	bytes = sizeof(u16) * (u16_strlen(src) + 1);
	copy = malloc(bytes);
	if (copy)
		memcpy(copy, src, bytes);

	return copy;
}
385
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * Exactly @size units are converted; a zero unit is copied like any other
 * unit and no terminator is appended. Every unpaired surrogate is replaced
 * by a single '?', including a high surrogate in the last unit. The unit
 * following an unpaired high surrogate is converted on its own.
 *
 * @dest:	output buffer
 * @src:	UTF-16 input
 * @size:	number of 16 bit units to convert
 * Return:	pointer to the byte following the last byte written
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	while (size) {
		uint32_t code = *src++;

		--size;

		if (code >= 0xD800 && code <= 0xDBFF) {
			/*
			 * High surrogate. Only consume the next unit if it is
			 * the matching low surrogate. Otherwise it is left in
			 * place together with its share of @size.
			 */
			if (size && *src >= 0xDC00 && *src <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code - 0xD800) << 10) +
				       (*src - 0xDC00) + 0x10000;
				++src;
				--size;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Error... */
				*dest++ = '?';
			}
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Error: low surrogate without high surrogate */
			*dest++ = '?';
		} else if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else {
			/* a single 16 bit unit never exceeds 0xFFFF */
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	return dest;
}