// SPDX-License-Identifier: GPL-2.0+
/*
 *  charset conversion utils
 *
 *  Copyright (c) 2017 Rob Clark
 */
7
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +02008#include <common.h>
Rob Clark78178bb2017-09-09 06:47:40 -04009#include <charset.h>
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020010#include <capitalization.h>
Rob Clark78178bb2017-09-09 06:47:40 -040011#include <malloc.h>
12
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020013static struct capitalization_table capitalization_table[] =
14#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
15 UNICODE_CAPITALIZATION_TABLE;
16#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
17 CP1250_CAPITALIZATION_TABLE;
18#else
19 CP437_CAPITALIZATION_TABLE;
20#endif
21
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020022/**
23 * get_code() - read Unicode code point from UTF-8 stream
24 *
25 * @read_u8: - stream reader
26 * @src: - string buffer passed to stream reader, optional
27 * Return: - Unicode code point
28 */
29static int get_code(u8 (*read_u8)(void *data), void *data)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020030{
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020031 s32 ch = 0;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020032
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020033 ch = read_u8(data);
34 if (!ch)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020035 return 0;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020036 if (ch >= 0xc2 && ch <= 0xf4) {
37 int code = 0;
38
39 if (ch >= 0xe0) {
40 if (ch >= 0xf0) {
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020041 /* 0xf0 - 0xf4 */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020042 ch &= 0x07;
43 code = ch << 18;
44 ch = read_u8(data);
45 if (ch < 0x80 || ch > 0xbf)
46 goto error;
47 ch &= 0x3f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020048 } else {
49 /* 0xe0 - 0xef */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020050 ch &= 0x0f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020051 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020052 code += ch << 12;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020053 if ((code >= 0xD800 && code <= 0xDFFF) ||
54 code >= 0x110000)
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020055 goto error;
56 ch = read_u8(data);
57 if (ch < 0x80 || ch > 0xbf)
58 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020059 }
60 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020061 ch &= 0x3f;
62 code += ch << 6;
63 ch = read_u8(data);
64 if (ch < 0x80 || ch > 0xbf)
65 goto error;
66 ch &= 0x3f;
67 ch += code;
68 } else if (ch >= 0x80) {
69 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020070 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020071 return ch;
72error:
73 return '?';
74}
75
/**
 * read_string() - read byte from character string
 *
 * @data:	pointer to a string pointer (const char **)
 * Return:	byte read, 0 if @data or the string pointer is NULL or the
 *		string pointer points to '\0'
 *
 * The string pointer is incremented if it does not point to '\0'.
 */
static u8 read_string(void *data)
{
	const char **src = (const char **)data;
	u8 c;

	if (!src || !*src || !**src)
		return 0;
	c = **src;
	++*src;
	return c;
}
96
97/**
98 * read_console() - read byte from console
99 *
100 * @src - not used, needed to match interface
101 * Return: - byte read
102 */
103static u8 read_console(void *data)
104{
105 return getc();
106}
107
108int console_read_unicode(s32 *code)
109{
110 if (!tstc()) {
111 /* No input available */
112 return 1;
113 }
114
115 /* Read Unicode code */
116 *code = get_code(read_console, NULL);
117 return 0;
118}
119
120s32 utf8_get(const char **src)
121{
122 return get_code(read_string, src);
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +0200123}
124
/**
 * utf8_put() - write Unicode code point as UTF-8 sequence
 *
 * @code:	code point to encode
 * @dst:	pointer to the output pointer; on success the output pointer
 *		is advanced past the one to four bytes written
 * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is
 *		negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
 *
 * No terminating '\0' is written. Nothing is written on error.
 */
int utf8_put(s32 code, char **dst)
{
	if (!dst || !*dst)
		return -1;
	/* A negative value would otherwise be stored as a raw byte */
	if (code < 0)
		return -1;
	if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;
	if (code <= 0x007F) {
		**dst = code;
	} else {
		if (code <= 0x07FF) {
			**dst = code >> 6 | 0xC0;
		} else {
			if (code < 0x10000) {
				**dst = code >> 12 | 0xE0;
			} else {
				**dst = code >> 18 | 0xF0;
				++*dst;
				**dst = (code >> 12 & 0x3F) | 0x80;
			}
			++*dst;
			**dst = (code >> 6 & 0x3F) | 0x80;
		}
		++*dst;
		**dst = (code & 0x3F) | 0x80;
	}
	++*dst;
	return 0;
}
153
154size_t utf8_utf16_strnlen(const char *src, size_t count)
155{
156 size_t len = 0;
157
158 for (; *src && count; --count) {
159 s32 code = utf8_get(&src);
160
161 if (!code)
162 break;
163 if (code < 0) {
164 /* Reserve space for a replacement character */
165 len += 1;
166 } else if (code < 0x10000) {
167 len += 1;
168 } else {
169 len += 2;
170 }
171 }
172 return len;
173}
174
175int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
176{
177 if (!src || !dst || !*dst)
178 return -1;
179
180 for (; count && *src; --count) {
181 s32 code = utf8_get(&src);
182
183 if (code < 0)
184 code = '?';
185 utf16_put(code, dst);
186 }
187 **dst = 0;
188 return 0;
189}
190
/**
 * utf16_get() - get next Unicode code point from UTF-16 string
 *
 * @src:	pointer to the string pointer; the string pointer is advanced
 *		past every u16 unit consumed
 * Return:	code point, 0 if the string pointer points to a 0 unit
 *		(nothing is consumed), -1 if @src or *@src is NULL or the
 *		sequence is not valid UTF-16
 *
 * A high surrogate (0xD800 - 0xDBFF) has to be followed by a low surrogate
 * (0xDC00 - 0xDFFF). A non-zero unit following a high surrogate is consumed
 * even if it is not a low surrogate.
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not come first */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		/* String ends after the high surrogate */
		if (!**src)
			return -1;
		code &= 0x3ff;
		code <<= 10;
		code += 0x10000;
		code2 = **src;
		++*src;
		/* Both ends of the low surrogate range are valid values */
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code2 &= 0x3ff;
		code += code2;
	}
	return code;
}
218
/**
 * utf16_put() - write Unicode code point as UTF-16 sequence
 *
 * @code:	code point to encode
 * @dst:	pointer to the output pointer; on success the output pointer
 *		is advanced past the one or two u16 units written
 * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is
 *		negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
 *
 * No terminating 0 is written. Nothing is written on error.
 */
int utf16_put(s32 code, u16 **dst)
{
	if (!dst || !*dst)
		return -1;
	/* A negative value would otherwise be stored truncated to 16 bit */
	if (code < 0)
		return -1;
	if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;
	if (code < 0x10000) {
		**dst = code;
	} else {
		/* Split into high and low surrogate carrying 10 bits each */
		code -= 0x10000;
		**dst = code >> 10 | 0xD800;
		++*dst;
		**dst = (code & 0x3ff) | 0xDC00;
	}
	++*dst;
	return 0;
}
236
237size_t utf16_strnlen(const u16 *src, size_t count)
238{
239 size_t len = 0;
240
241 for (; *src && count; --count) {
242 s32 code = utf16_get(&src);
243
244 if (!code)
245 break;
246 /*
247 * In case of an illegal sequence still reserve space for a
248 * replacement character.
249 */
250 ++len;
251 }
252 return len;
253}
254
255size_t utf16_utf8_strnlen(const u16 *src, size_t count)
256{
257 size_t len = 0;
258
259 for (; *src && count; --count) {
260 s32 code = utf16_get(&src);
261
262 if (!code)
263 break;
264 if (code < 0)
265 /* Reserve space for a replacement character */
266 len += 1;
267 else if (code < 0x80)
268 len += 1;
269 else if (code < 0x800)
270 len += 2;
271 else if (code < 0x10000)
272 len += 3;
273 else
274 len += 4;
275 }
276 return len;
277}
278
279int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
280{
281 if (!src || !dst || !*dst)
282 return -1;
283
284 for (; count && *src; --count) {
285 s32 code = utf16_get(&src);
286
287 if (code < 0)
288 code = '?';
289 utf8_put(code, dst);
290 }
291 **dst = 0;
292 return 0;
293}
294
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +0200295s32 utf_to_lower(const s32 code)
296{
297 struct capitalization_table *pos = capitalization_table;
298 s32 ret = code;
299
300 if (code <= 0x7f) {
301 if (code >= 'A' && code <= 'Z')
302 ret += 0x20;
303 return ret;
304 }
305 for (; pos->upper; ++pos) {
306 if (pos->upper == code) {
307 ret = pos->lower;
308 break;
309 }
310 }
311 return ret;
312}
313
314s32 utf_to_upper(const s32 code)
315{
316 struct capitalization_table *pos = capitalization_table;
317 s32 ret = code;
318
319 if (code <= 0x7f) {
320 if (code >= 'a' && code <= 'z')
321 ret -= 0x20;
322 return ret;
323 }
324 for (; pos->lower; ++pos) {
325 if (pos->lower == code) {
326 ret = pos->upper;
327 break;
328 }
329 }
330 return ret;
331}
Rob Clark78178bb2017-09-09 06:47:40 -0400332
/**
 * u16_strlen() - count non-zero u16 units
 *
 * @in:		0 terminated array of u16; must not be NULL
 * Return:	number of u16 units before the first 0 unit
 *
 * Units are counted, not code points: a surrogate pair counts as two.
 */
size_t u16_strlen(const u16 *in)
{
	size_t i;

	for (i = 0; in[i]; i++)
		;
	return i;
}
339
/**
 * u16_strnlen() - count non-zero u16 units with limit
 *
 * @in:		array of u16; must not be NULL unless @count is 0
 * @count:	maximum number of u16 units to inspect
 * Return:	number of u16 units before the first 0 unit, at most @count
 *
 * Units are counted, not code points: a surrogate pair counts as two.
 */
size_t u16_strnlen(const u16 *in, size_t count)
{
	size_t i;

	for (i = 0; i < count && in[i]; i++)
		;
	return i;
}
346
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * @dest:	output buffer
 * @src:	UTF-16 input
 * @size:	number of u16 units to read from @src
 * Return:	pointer to the byte following the last byte written
 *
 * Exactly @size units are converted; a 0 unit is copied like any other
 * value below 0x80 and does not end the conversion. No terminating '\0' is
 * appended. An unpaired surrogate is replaced by '?'. At most three bytes
 * are written per input unit, so a buffer of 3 * @size bytes is sufficient.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	/* Pending high surrogate, 0 if none */
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Error... */
				*dest++ = '?';
				/*
				 * *src may be valid. Don't eat it: step back
				 * and give the unit back to the loop counter,
				 * otherwise the last input unit gets lost.
				 */
				src--;
				size++;
			}

			code_high = 0;
		} else if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* High surrogate, wait for the low surrogate */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Error... low surrogate without high surrogate */
			*dest++ = '?';
		} else {
			/* A 16 bit unit is always below 0x10000 */
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	/* Input ended after a high surrogate */
	if (code_high)
		*dest++ = '?';

	return dest;
}