Libparserutils
codec_utf8.c
Go to the documentation of this file.
1 /*
2  * This file is part of LibParserUtils.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdlib.h>
10 #include <string.h>
11 
13 
16 #include "utils/endian.h"
17 #include "utils/utils.h"
18 
/* UTF-8 charset codec state.
 *
 * NOTE(review): this listing omits several original lines of the struct
 * (the base member, the inval_buf and write_buf declarations and the
 * closing brace), so the definition is incomplete as shown here. */
22 typedef struct charset_utf8_codec {
/* Capacity of inval_buf, the buffer used for fixing up incomplete
 * input sequences. */
25 #define INVAL_BUFSIZE (32)
29  size_t inval_len; /**< Byte length of inval_buf */
30 
/* Capacity, in characters, of read_buf. */
31 #define READ_BUFSIZE (8)
/* Buffer for partial output sequences (decode), held host-endian. */
32  uint32_t read_buf[READ_BUFSIZE];
/* Character length of read_buf. */
35  size_t read_len;
/* Capacity, in characters, of write_buf (partial output sequences for
 * encode, held host-endian). */
37 #define WRITE_BUFSIZE (8)
/* Character length of write_buf. */
41  size_t write_len;
44 
/* Forward declarations of the codec's file-local functions.
 *
 * NOTE(review): the opening line of most prototypes is missing from this
 * listing, so the parameter lines below appear without their function
 * names. */
45 static bool charset_utf8_codec_handles_charset(const char *charset);
46 static parserutils_error charset_utf8_codec_create(const char *charset,
52  const uint8_t **source, size_t *sourcelen,
53  uint8_t **dest, size_t *destlen);
56  const uint8_t **source, size_t *sourcelen,
57  uint8_t **dest, size_t *destlen);
62  const uint8_t **source, size_t *sourcelen,
63  uint8_t **dest, size_t *destlen);
66  uint32_t ucs4, uint8_t **dest, size_t *destlen);
67 
/* Determine whether this codec handles a specific charset.
 *
 * charset: name of the charset to test; it is passed to strlen(), so it
 *          must be a NUL-terminated string.
 * Returns the result of an equality comparison between a value derived
 * from charset and a value derived from the literal "UTF-8".
 *
 * NOTE(review): original lines 76 and 78 (the calls that produce the two
 * compared values) are missing from this listing; presumably both are
 * MIB enum lookups by name -- confirm against the full source. */
74 bool charset_utf8_codec_handles_charset(const char *charset)
75 {
77  strlen(charset)) ==
79  SLEN("UTF-8"));
80 }
81 
/* Create a UTF-8 codec.
 *
 * charset: unused by this implementation (see UNUSED below).
 * codec:   receives a pointer to the newly allocated codec on success.
 * Returns PARSERUTILS_OK on success, PARSERUTILS_NOMEM if the allocation
 * fails (in which case *codec is not written).
 *
 * NOTE(review): the signature line, the declaration of c and the vtable
 * assignments (original lines 112-115) are missing from this listing. */
93 {
95 
96  UNUSED(charset);
97 
98  c = malloc(sizeof(charset_utf8_codec));
99  if (c == NULL)
100  return PARSERUTILS_NOMEM;
101 
/* Start with all three state buffers empty. Only the first element of
 * each buffer is cleared; the length fields are what mark them empty. */
102  c->inval_buf[0] = '\0';
103  c->inval_len = 0;
104 
105  c->read_buf[0] = 0;
106  c->read_len = 0;
107 
108  c->write_buf[0] = 0;
109  c->write_len = 0;
110 
111  /* Finally, populate vtable */
116 
/* Hand the codec back through the generic codec pointer type. */
117  *codec = (parserutils_charset_codec *) c;
118 
119  return PARSERUTILS_OK;
120 }
121 
/* Destroy a UTF-8 codec.
 *
 * codec: the codec being destroyed; not touched by this function.
 * Always returns PARSERUTILS_OK.
 *
 * NOTE(review): nothing is freed here; presumably the caller releases the
 * codec allocation itself -- confirm against the generic codec code. */
129 {
130  UNUSED(codec);
131 
132  return PARSERUTILS_OK;
133 }
134 
/* Encode a chunk of UCS-4 (big endian) data into UTF-8.
 *
 * codec:     the codec; cast to charset_utf8_codec to reach write_buf.
 * source:    pointer to the input cursor; advanced by 4 bytes for every
 *            character consumed.
 * sourcelen: pointer to the remaining input length in bytes; reduced by 4
 *            for every character consumed.
 * dest, destlen: output cursor and remaining output space, handed to
 *            UTF8_FROM_UCS4 (presumably updated by that macro -- its
 *            definition is not part of this listing).
 * Returns PARSERUTILS_OK once all input is consumed, or PARSERUTILS_NOMEM
 * if the output buffer fills up; in that case the unwritten character is
 * parked in write_buf and already counted as consumed from the input.
 *
 * NOTE(review): the main loop runs while *sourcelen > 0 but always reads
 * and subtracts 4 bytes, so a length that is not a multiple of 4 would
 * read past the input and wrap *sourcelen. Presumably callers guarantee
 * whole UCS-4 units -- confirm. */
163  const uint8_t **source, size_t *sourcelen,
164  uint8_t **dest, size_t *destlen)
165 {
166  charset_utf8_codec *c = (charset_utf8_codec *) codec;
167  uint32_t ucs4;
168  uint32_t *towrite;
169  size_t towritelen;
170  parserutils_error error;
171 
172  /* Process any outstanding characters from the previous call */
173  if (c->write_len > 0) {
174  uint32_t *pwrite = c->write_buf;
175 
176  while (c->write_len > 0) {
177  UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
178  if (error != PARSERUTILS_OK) {
179  uint32_t len;
180  assert(error == PARSERUTILS_NOMEM);
181 
182  /* Insufficient output buffer space */
/* Move the still-unwritten entries to the front of write_buf so
 * the next call can resume from index 0. */
183  for (len = 0; len < c->write_len; len++) {
184  c->write_buf[len] = pwrite[len];
185  }
186 
187  return PARSERUTILS_NOMEM;
188  }
189 
190  pwrite++;
191  c->write_len--;
192  }
193  }
194 
195  /* Now process the characters for this call */
196  while (*sourcelen > 0) {
/* NOTE(review): the input is read through a uint32_t pointer cast, which
 * assumes *source is suitably aligned for uint32_t -- confirm. */
197  ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
/* One character per iteration; towrite/towritelen describe a one-element
 * run starting at the local ucs4. */
198  towrite = &ucs4;
199  towritelen = 1;
200 
201  /* Output current characters */
202  while (towritelen > 0) {
203  UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
204  if (error != PARSERUTILS_OK) {
205  uint32_t len;
206  assert(error == PARSERUTILS_NOMEM);
207 
208  /* Insufficient output space */
209  assert(towritelen < WRITE_BUFSIZE);
210 
211  c->write_len = towritelen;
212 
213  /* Copy pending chars to save area, for
214  * processing next call. */
215  for (len = 0; len < towritelen; len++)
216  c->write_buf[len] = towrite[len];
217 
218  /* Claim character we've just buffered,
219  * so it's not reprocessed */
220  *source += 4;
221  *sourcelen -= 4;
222 
223  return PARSERUTILS_NOMEM;
224  }
225 
226  towrite++;
227  towritelen--;
228  }
229 
/* Character fully written: step past its four input bytes. */
230  *source += 4;
231  *sourcelen -= 4;
232  }
233 
234  return PARSERUTILS_OK;
235 }
236 
/* Decode a chunk of UTF-8 data into UCS-4 (big endian).
 *
 * codec:     the codec; cast to charset_utf8_codec to reach its buffers.
 * source, sourcelen: input cursor and remaining input length in bytes;
 *            both are advanced as input is consumed.
 * dest, destlen: output cursor and remaining output space in bytes; each
 *            character written here takes 4 bytes.
 * Returns PARSERUTILS_OK when the input has been consumed,
 * PARSERUTILS_NOMEM when the output buffer is too small, or whatever other
 * error the per-character step reports.
 *
 * Work is done in three stages: flush characters left in read_buf by an
 * earlier call, retry an incomplete sequence saved in inval_buf using the
 * start of the new input, then process the remaining input.
 *
 * NOTE(review): the call lines at original lines 325 and 347 are missing
 * from this listing; judging by their arguments they presumably invoke
 * the single-character reader -- confirm against the full source. */
279  const uint8_t **source, size_t *sourcelen,
280  uint8_t **dest, size_t *destlen)
281 {
282  charset_utf8_codec *c = (charset_utf8_codec *) codec;
283  parserutils_error error;
284 
285  if (c->read_len > 0) {
286  /* Output left over from last decode */
287  uint32_t *pread = c->read_buf;
288 
/* Entries are written only while there is room for ALL that remain
 * (read_len * 4 bytes), not just for the next one. */
289  while (c->read_len > 0 && *destlen >= c->read_len * 4) {
290  *((uint32_t *) (void *) *dest) =
291  endian_host_to_big(pread[0]);
292 
293  *dest += 4;
294  *destlen -= 4;
295 
296  pread++;
297  c->read_len--;
298  }
299 
300  if (*destlen < c->read_len * 4) {
301  /* Ran out of output buffer */
302  size_t i;
303 
304  /* Shuffle remaining output down */
305  for (i = 0; i < c->read_len; i++)
306  c->read_buf[i] = pread[i];
307 
308  return PARSERUTILS_NOMEM;
309  }
310  }
311 
312  if (c->inval_len > 0) {
313  /* The last decode ended in an incomplete sequence.
314  * Fill up inval_buf with data from the start of the
315  * new chunk and process it. */
316  uint8_t *in = c->inval_buf;
317  size_t ol = c->inval_len;
/* Take at most the free space in inval_buf minus one byte, so that
 * ol + l never exceeds INVAL_BUFSIZE - 1. */
318  size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
319  size_t orig_l = l;
320 
321  memcpy(c->inval_buf + ol, *source, l);
322 
/* l now counts the saved bytes plus the newly appended ones. */
323  l += c->inval_len;
324 
326  (const uint8_t **) &in, &l, dest, destlen);
327  if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
328  return error;
329  }
330 
331  /* And now, fix up source pointers */
/* Presumably l now holds the bytes left unconsumed in inval_buf, so
 * orig_l - l is how many of the newly supplied bytes were used; the
 * difference is computed in size_t, cast to signed and clamped at 0. */
332  *source += max((signed) (orig_l - l), 0);
333  *sourcelen -= max((signed) (orig_l - l), 0);
334 
335  /* Failed to resolve an incomplete character and
336  * ran out of buffer space. No recovery strategy
337  * possible, so explode everywhere. */
338  assert((orig_l + ol) - l != 0);
339 
340  /* Report memory exhaustion case from above */
341  if (error != PARSERUTILS_OK)
342  return error;
343  }
344 
345  /* Finally, the "normal" case; process all outstanding characters */
346  while (*sourcelen > 0) {
348  source, sourcelen, dest, destlen);
349  if (error != PARSERUTILS_OK) {
350  return error;
351  }
352  }
353 
354  return PARSERUTILS_OK;
355 }
356 
/* Clear a UTF-8 codec's encoding state.
 *
 * codec: the codec to reset; cast to charset_utf8_codec below.
 * Always returns PARSERUTILS_OK.
 *
 * Each of the three buffers is emptied by zeroing its length field and
 * its first element; the remaining elements are left as they were. */
364 {
365  charset_utf8_codec *c = (charset_utf8_codec *) codec;
366 
367  c->inval_buf[0] = '\0';
368  c->inval_len = 0;
369 
370  c->read_buf[0] = 0;
371  c->read_len = 0;
372 
373  c->write_buf[0] = 0;
374  c->write_len = 0;
375 
376  return PARSERUTILS_OK;
377 }
378 
379 
/* Read a character from the UTF-8 input and output it as UCS-4 (big
 * endian).
 *
 * source, sourcelen: input cursor and remaining input length in bytes;
 *            advanced past whatever this call consumes.
 * dest, destlen: output cursor and remaining output space.
 * Returns PARSERUTILS_OK on success (including when an incomplete
 * sequence was saved for the next call), PARSERUTILS_NOMEM if the output
 * step ran out of space, PARSERUTILS_INVALID for an illegal sequence in
 * strict error mode, or any other error reported while skipping ahead.
 *
 * NOTE(review): the first signature line (which declares c) and the call
 * lines at original lines 426, 457 and 499 are missing from this listing;
 * the calls presumably output the decoded character / U+FFFD, and line
 * 457 presumably compares against the strict error mode constant. */
409  const uint8_t **source, size_t *sourcelen,
410  uint8_t **dest, size_t *destlen)
411 {
412  uint32_t ucs4;
413  size_t sucs4;
414  parserutils_error error;
415 
416  /* Convert a single character */
/* The macro is given local copies and pointers, so *source and
 * *sourcelen are not passed to it directly. */
417  {
418  const uint8_t *src = *source;
419  size_t srclen = *sourcelen;
420  uint32_t *uptr = &ucs4;
421  size_t *usptr = &sucs4;
422  UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
423  }
424  if (error == PARSERUTILS_OK) {
425  /* Read a character */
427  ucs4, dest, destlen);
/* The input is consumed on NOMEM as well as on OK. */
428  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
429  /* output succeeded; update source pointers */
430  *source += sucs4;
431  *sourcelen -= sucs4;
432  }
433 
434  /* Clear inval buffer */
435  c->inval_buf[0] = '\0';
436  c->inval_len = 0;
437 
438  return error;
439  } else if (error == PARSERUTILS_NEEDDATA) {
440  /* Incomplete input sequence */
441  assert(*sourcelen < INVAL_BUFSIZE);
442 
/* Save the partial sequence (NUL-terminated) and report the whole
 * remaining input as consumed. */
443  memmove(c->inval_buf, *source, *sourcelen);
444  c->inval_buf[*sourcelen] = '\0';
445  c->inval_len = *sourcelen;
446 
447  *source += *sourcelen;
448  *sourcelen = 0;
449 
450  return PARSERUTILS_OK;
451  } else if (error == PARSERUTILS_INVALID) {
452  /* Illegal input sequence */
453  uint32_t nextchar;
454 
455  /* Strict errormode; simply flag invalid character */
456  if (c->base.errormode ==
458  /* Clear inval buffer */
459  c->inval_buf[0] = '\0';
460  c->inval_len = 0;
461 
462  return PARSERUTILS_INVALID;
463  }
464 
465  /* Find next valid UTF-8 sequence.
466  * We're processing client-provided data, so let's
467  * be paranoid about its validity. */
468  {
469  const uint8_t *src = *source;
470  size_t srclen = *sourcelen;
471  uint32_t off = 0;
472  uint32_t *ncptr = &nextchar;
473 
474  UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
475  }
476  if (error != PARSERUTILS_OK) {
477  if (error == PARSERUTILS_NEEDDATA) {
478  /* Need more data to be sure */
479  assert(*sourcelen < INVAL_BUFSIZE);
480 
481  memmove(c->inval_buf, *source, *sourcelen);
482  c->inval_buf[*sourcelen] = '\0';
483  c->inval_len = *sourcelen;
484 
485  *source += *sourcelen;
486  *sourcelen = 0;
487 
/* All input was just claimed above, so nothing more is skipped when
 * the source pointers are updated below. */
488  nextchar = 0;
489  } else {
490  return error;
491  }
492  }
493 
/* NOTE(review): when the NEEDDATA path above has just filled inval_buf and
 * claimed the input, the clear below empties it again, so those bytes look
 * like they are dropped rather than retried on the next call -- confirm
 * whether that is intended. */
494  /* Clear inval buffer */
495  c->inval_buf[0] = '\0';
496  c->inval_len = 0;
497 
498  /* output U+FFFD and continue processing. */
500  0xFFFD, dest, destlen);
501  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
502  /* output succeeded; update source pointers */
503  *source += nextchar;
504  *sourcelen -= nextchar;
505  }
506 
507  return error;
508  }
509 
/* Any other conversion result falls through to here and reports OK. */
510  return PARSERUTILS_OK;
511 }
512 
/* Output a UCS-4 character (big endian).
 *
 * ucs4:    the character to output, in host byte order (it is converted
 *          with endian_host_to_big before being stored).
 * dest:    pointer to the output cursor; advanced by 4 on success.
 * destlen: pointer to the remaining output space in bytes; reduced by 4
 *          on success.
 * Returns PARSERUTILS_OK if the character was written, or
 * PARSERUTILS_NOMEM if fewer than 4 bytes of output space remain; in that
 * case the character is kept in read_buf[0] with read_len set to 1.
 *
 * NOTE(review): the first signature line (which declares c) is missing
 * from this listing. */
524  uint32_t ucs4, uint8_t **dest, size_t *destlen)
525 {
526  if (*destlen < 4) {
527  /* Run out of output buffer */
528  c->read_len = 1;
529  c->read_buf[0] = ucs4;
530 
531  return PARSERUTILS_NOMEM;
532  }
533 
/* NOTE(review): the store goes through a uint32_t pointer cast, which
 * assumes *dest is suitably aligned for uint32_t -- confirm. */
534  *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
535  *dest += 4;
536  *destlen -= 4;
537 
538  return PARSERUTILS_OK;
539 }
540 
541 
545 };
546 
charset_utf8_codec::write_len
size_t write_len
Character length of write_buf.
Definition: codec_utf8.c:41
utf8impl.h
charset_utf8_codec::inval_buf
uint8_t inval_buf[INVAL_BUFSIZE]
Buffer for fixing up incomplete input sequences.
Definition: codec_utf8.c:26
parserutils_charset_codec::encode
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:26
INVAL_BUFSIZE
#define INVAL_BUFSIZE
Definition: codec_utf8.c:25
parserutils_charset_codec::handler
struct parserutils_charset_codec::@3 handler
Vtable for handler code.
charset_utf8_codec_create
static parserutils_error charset_utf8_codec_create(const char *charset, parserutils_charset_codec **codec)
Create a UTF-8 codec.
Definition: codec_utf8.c:91
parserutils_charset_codec::destroy
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition: codec_impl.h:25
PARSERUTILS_OK
@ PARSERUTILS_OK
Definition: errors.h:19
charset_utf8_codec::base
parserutils_charset_codec base
Base class.
Definition: codec_utf8.c:23
charset_utf8_codec::write_buf
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition: codec_utf8.c:38
len
size_t len
Definition: codec_8859.c:23
charset_utf8_codec_read_char
static parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from the UTF-8 to UCS-4 (big endian)
Definition: codec_utf8.c:408
charset_utf8_codec_destroy
static parserutils_error charset_utf8_codec_destroy(parserutils_charset_codec *codec)
Destroy a UTF-8 codec.
Definition: codec_utf8.c:128
PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition: codec.h:64
parserutils_charset_codec
Core charset codec definition; implementations extend this.
Definition: codec_impl.h:19
UTF8_NEXT_PARANOID
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)
Skip to start of next sequence in UTF-8 input.
Definition: utf8impl.h:303
utils.h
charset_utf8_codec::read_buf
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition: codec_utf8.c:32
max
#define max(a, b)
Definition: utils.h:12
UTF8_FROM_UCS4
#define UTF8_FROM_UCS4(ucs4, s, len, error)
Convert a single UCS-4 character into a UTF-8 multibyte sequence.
Definition: utf8impl.h:123
charset_utf8_codec_encode
static parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into UTF-8.
Definition: codec_utf8.c:162
parserutils_charset_codec::reset
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition: codec_impl.h:32
charset_utf8_codec_handler
const parserutils_charset_handler charset_utf8_codec_handler
Definition: codec_utf8.c:542
PARSERUTILS_INVALID
@ PARSERUTILS_INVALID
Definition: errors.h:23
parserutils_charset_codec::errormode
parserutils_charset_codec_errormode errormode
error mode
Definition: codec_impl.h:22
charset_utf8_codec
struct charset_utf8_codec charset_utf8_codec
UTF-8 charset codec.
charset_utf8_codec_reset
static parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
Clear a UTF-8 codec's encoding state.
Definition: codec_utf8.c:363
codec_impl.h
charset_utf8_codec::inval_len
size_t inval_len
Definition: codec_utf8.c:29
WRITE_BUFSIZE
#define WRITE_BUFSIZE
Definition: codec_utf8.c:37
charset_utf8_codec::read_len
size_t read_len
Character length of read_buf.
Definition: codec_utf8.c:35
charset_utf8_codec_output_decoded_char
static parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition: codec_utf8.c:523
endian_big_to_host
static uint32_t endian_big_to_host(uint32_t big)
Definition: endian.h:32
parserutils_error
parserutils_error
Definition: errors.h:18
charset_utf8_codec_decode
static parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of UTF-8 data into UCS-4 (big endian)
Definition: codec_utf8.c:278
PARSERUTILS_NEEDDATA
@ PARSERUTILS_NEEDDATA
Definition: errors.h:25
SLEN
#define SLEN(s)
Definition: utils.h:21
parserutils_charset_handler
Codec factory component definition.
Definition: codec_impl.h:39
charset_utf8_codec_handles_charset
static bool charset_utf8_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition: codec_utf8.c:74
PARSERUTILS_NOMEM
@ PARSERUTILS_NOMEM
Definition: errors.h:21
min
#define min(a, b)
Definition: utils.h:16
UTF8_TO_UCS4
#define UTF8_TO_UCS4(s, len, ucs4, clen, error)
Convert a UTF-8 multibyte sequence into a single UCS-4 character.
Definition: utf8impl.h:34
READ_BUFSIZE
#define READ_BUFSIZE
Definition: codec_utf8.c:31
parserutils_charset_codec::decode
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:29
charset_utf8_codec
UTF-8 charset codec.
Definition: codec_utf8.c:22
mibenum.h
parserutils_charset_mibenum_from_name
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition: aliases.c:107
endian_host_to_big
static uint32_t endian_host_to_big(uint32_t host)
Definition: endian.h:24
UNUSED
#define UNUSED(x)
Definition: utils.h:25
endian.h