Libparserutils
utf16.c
Go to the documentation of this file.
1 /*
2  * This file is part of LibParserUtils.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
12 #include <stdbool.h>
13 #include <stdlib.h>
14 #include <string.h>
15 
17 
28  size_t len, uint32_t *ucs4, size_t *clen)
29 {
30  const uint16_t *ss = (const uint16_t *) (const void *) s;
31 
32  if (s == NULL || ucs4 == NULL || clen == NULL)
33  return PARSERUTILS_BADPARM;
34 
35  if (len < 2)
36  return PARSERUTILS_NEEDDATA;
37 
38  if (*ss < 0xD800 || *ss > 0xDFFF) {
39  *ucs4 = *ss;
40  *clen = 2;
41  } else if (0xD800 <= *ss && *ss <= 0xDBFF) {
42  /* High-surrogate code unit. */
43  if (len < 4)
44  return PARSERUTILS_NEEDDATA;
45 
46  if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) {
47  /* We have a valid surrogate pair. */
48  *ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF))
49  + (1<<16);
50  *clen = 4;
51  } else {
52  return PARSERUTILS_INVALID;
53  }
54  } else {
55  /* Low-surrogate code unit. */
56  return PARSERUTILS_INVALID;
57  }
58 
59  return PARSERUTILS_OK;
60 }
61 
71  size_t *len)
72 {
73  uint16_t *ss = (uint16_t *) (void *) s;
74  uint32_t l = 0;
75 
76  if (s == NULL || len == NULL)
77  return PARSERUTILS_BADPARM;
78  else if (ucs4 < 0x10000) {
79  *ss = (uint16_t) ucs4;
80  l = 2;
81  } else if (ucs4 < 0x110000) {
82  ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
83  ss[1] = 0xDC00 | (ucs4 & 0x3ff);
84  l = 4;
85  } else {
86  return PARSERUTILS_INVALID;
87  }
88 
89  *len = l;
90 
91  return PARSERUTILS_OK;
92 }
93 
103  size_t *len)
104 {
105  const uint16_t *ss = (const uint16_t *) (const void *) s;
106  const uint16_t *end = (const uint16_t *) (const void *) (s + max);
107  int l = 0;
108 
109  if (s == NULL || len == NULL)
110  return PARSERUTILS_BADPARM;
111 
112  while (ss < end) {
113  if (*ss < 0xD800 || 0xDFFF < *ss)
114  ss++;
115  else
116  ss += 2;
117 
118  l++;
119  }
120 
121  *len = l;
122 
123  return PARSERUTILS_OK;
124 }
125 
134  size_t *len)
135 {
136  const uint16_t *ss = (const uint16_t *) (const void *) s;
137 
138  if (s == NULL || len == NULL)
139  return PARSERUTILS_BADPARM;
140 
141  if (*ss < 0xD800 || 0xDFFF < *ss)
142  *len = 2;
143  else
144  *len = 4;
145 
146  return PARSERUTILS_OK;
147 }
148 
158 parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
159  uint32_t *prevoff)
160 {
161  const uint16_t *ss = (const uint16_t *) (const void *) s;
162 
163  if (s == NULL || prevoff == NULL)
164  return PARSERUTILS_BADPARM;
165 
166  if (off < 2)
167  *prevoff = 0;
168  else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
169  *prevoff = off - 2;
170  else
171  *prevoff = (off < 4) ? 0 : off - 4;
172 
173  return PARSERUTILS_OK;
174 }
175 
187  uint32_t off, uint32_t *nextoff)
188 {
189  const uint16_t *ss = (const uint16_t *) (const void *) s;
190 
191  if (s == NULL || off >= len || nextoff == NULL)
192  return PARSERUTILS_BADPARM;
193 
194  if (len - off < 4)
195  *nextoff = len;
196  else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
197  *nextoff = off + 2;
198  else
199  *nextoff = (len - off < 6) ? len : off + 4;
200 
201  return PARSERUTILS_OK;
202 }
203 
215  uint32_t len, uint32_t off, uint32_t *nextoff)
216 {
217  const uint16_t *ss = (const uint16_t *) (const void *) s;
218 
219  if (s == NULL || off >= len || nextoff == NULL)
220  return PARSERUTILS_BADPARM;
221 
222  while (1) {
223  if (len - off < 4) {
224  return PARSERUTILS_NEEDDATA;
225  } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
226  *nextoff = off + 2;
227  break;
228  } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
229  if (len - off < 6)
230  return PARSERUTILS_NEEDDATA;
231 
232  if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
233  *nextoff = off + 4;
234  break;
235  } else {
236  ss++;
237  off += 2;
238  }
239  }
240  }
241 
242  return PARSERUTILS_OK;
243 }
244 
PARSERUTILS_BADPARM
@ PARSERUTILS_BADPARM
Definition: errors.h:22
parserutils_charset_utf16_prev
parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off, uint32_t *prevoff)
Find previous legal UTF-16 char in string.
Definition: utf16.c:158
parserutils_charset_utf16_to_ucs4
parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, size_t len, uint32_t *ucs4, size_t *clen)
Convert a UTF-16 sequence into a single UCS-4 character.
Definition: utf16.c:27
PARSERUTILS_OK
@ PARSERUTILS_OK
Definition: errors.h:19
len
size_t len
Definition: codec_8859.c:23
parserutils_charset_utf16_next
parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-16 char in string.
Definition: utf16.c:186
max
#define max(a, b)
Definition: utils.h:12
PARSERUTILS_INVALID
@ PARSERUTILS_INVALID
Definition: errors.h:23
parserutils_charset_utf16_length
parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max, size_t *len)
Calculate the length (in characters) of a bounded UTF-16 string.
Definition: utf16.c:102
utf16.h
parserutils_error
parserutils_error
Definition: errors.h:18
PARSERUTILS_NEEDDATA
@ PARSERUTILS_NEEDDATA
Definition: errors.h:25
parserutils_charset_utf16_char_byte_length
parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s, size_t *len)
Calculate the length (in bytes) of a UTF-16 character.
Definition: utf16.c:133
parserutils_charset_utf16_next_paranoid
parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-16 char in string.
Definition: utf16.c:214
parserutils_charset_utf16_from_ucs4
parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, size_t *len)
Convert a single UCS-4 character into a UTF-16 sequence.
Definition: utf16.c:70