CsString  1.4.0
cs_encoding.h
1 /***********************************************************************
2 *
3 * Copyright (c) 2017-2024 Barbara Geller
4 * Copyright (c) 2017-2024 Ansel Sermersheim
5 *
6 * This file is part of CsString.
7 *
8 * CsString is free software, released under the BSD 2-Clause license.
9 * For license details refer to LICENSE provided with this project.
10 *
11 * CsString is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 *
15 * https://opensource.org/licenses/BSD-2-Clause
16 *
17 ***********************************************************************/
18 
19 #ifndef LIB_CS_ENCODING_H
20 #define LIB_CS_ENCODING_H
21 
22 #include <stdint.h>
23 #include <vector>
24 
25 #include <cs_char.h>
26 
27 namespace CsString {
28 
29 class LIB_CS_STRING_EXPORT utf8
30 {
31  public:
32  using size_type = std::ptrdiff_t;
33  using storage_unit = uint8_t;
34 
35  template <typename Iterator>
36  static Iterator advance(Iterator iter_begin, Iterator iter_end, size_type count)
37  {
38  auto iter = iter_begin;
39  storage_unit value = 0;
40 
41  while (iter != iter_end && count != 0) {
42 
43  value = *iter;
44  if (value < 0x80 || value > 0xBf) {
45  --count;
46  }
47 
48  ++iter;
49  }
50 
51  if (value >= 0xBF) {
52  while (iter != iter_end && *iter >= 0x80 && *iter <= 0xBF) {
53  ++iter;
54  }
55  }
56 
57  return iter;
58  }
59 
60  template <typename Iterator>
61  static size_type distance(Iterator iter_begin, Iterator iter_end)
62  {
63  size_type retval = 0;
64 
65  for (auto iter = iter_begin; iter != iter_end; ++iter) {
66  storage_unit value = *iter;
67 
68  if (value < 0x80 || value > 0xBF) {
69  // ascii or first byte of a multi byte sequence
70  ++retval;
71  }
72  }
73 
74  return retval;
75  }
76 
77  template <typename Container>
78  static typename Container::const_iterator insert(Container &str1,
79  typename Container::const_iterator iter, CsChar c, size_type count = 1)
80  {
81  uint32_t value = c.unicode();
82 
83  for (size_type x = 0; x < count; ++x) {
84  if (value <= 0x007F) {
85  iter = str1.insert(iter, value);
86 
87  } else if (value <= 0x07FF) {
88  iter = str1.insert(iter, ((value) & 0x3F) | 0x80);
89  iter = str1.insert(iter, ((value >> 6) & 0x1F) | 0xC0);
90 
91  } else if (value <= 0xFFFF) {
92  iter = str1.insert(iter, ((value ) & 0x3F) | 0x80);
93  iter = str1.insert(iter, ((value >> 6 ) & 0x3F) | 0x80);
94  iter = str1.insert(iter, ((value >> 12) & 0x0F) | 0xE0);
95 
96  } else {
97  iter = str1.insert(iter, ((value ) & 0x3F) | 0x80);
98  iter = str1.insert(iter, ((value >> 6 ) & 0x3F) | 0x80);
99  iter = str1.insert(iter, ((value >> 12) & 0x3F) | 0x80);
100  iter = str1.insert(iter, ((value >> 18) & 0x07) | 0xF0);
101 
102  }
103  }
104 
105  return iter;
106  }
107 
108  static size_type walk(size_type len, std::vector<storage_unit>::const_iterator iter)
109  {
110  size_type retval = 0;
111  size_type count = 0;
112 
113  if (len >= 0) {
114  // walk forward
115 
116  for (size_type x = 0; x < len; ++x) {
117  uint8_t value = *iter;
118 
119  count = numOfBytes(value);
120  iter += count;
121 
122  retval += count;
123  }
124 
125  } else {
126  // walk backwards
127 
128  for (size_type x = 0; x > len; --x) {
129 
130  while (true) {
131  --iter;
132  --retval;
133 
134  uint8_t value = *iter;
135 
136  if ((value & 0xC0) != 0x80) {
137  // at the beginning of a char
138  break;
139  }
140  }
141  }
142  }
143 
144  return retval;
145  }
146 
147  static CsChar getCodePoint(std::vector<storage_unit>::const_iterator iter)
148  {
149  char32_t value = 0;
150  uint8_t tmp = *iter;
151 
152  if ((tmp & 0x80) == 0) {
153  value = tmp;
154 
155  } else if ((tmp & 0xE0) == 0xC0) {
156  value = (tmp & 0x1F) << 6;
157 
158  tmp = iter[1];
159  value |= (tmp & 0x3F);
160 
161 
162  } else if ((tmp & 0xF0) == 0xE0) {
163  value = (tmp & 0x0F) << 12;
164 
165  tmp = iter[1];
166  value |= (tmp & 0x3F) << 6;
167 
168  tmp = iter[2];
169  value |= (tmp & 0x3F);
170 
171  } else {
172  value = (tmp & 0x07) << 18;
173 
174  tmp = iter[1];
175  value |= (tmp & 0x3F) << 12;
176 
177  tmp = iter[2];
178  value |= (tmp & 0x3F) << 6;
179 
180  tmp = iter[3];
181  value |= (tmp & 0x3F);
182 
183  }
184 
185  return CsChar(value);
186  }
187 
188  private:
189  static size_type numOfBytes(uint8_t value)
190  {
191  if ((value & 0x80) == 0) {
192  return 1;
193 
194  } else if ((value & 0xE0) == 0xC0) {
195  return 2;
196 
197  } else if ((value & 0xF0) == 0xE0) {
198  return 3;
199 
200  } else if ((value & 0xF8) == 0xF0) {
201  return 4;
202 
203  }
204 
205  return 1;
206  }
207 };
208 
209 class LIB_CS_STRING_EXPORT utf16
210 {
211  public:
212  using size_type = std::ptrdiff_t;
213  using storage_unit = uint16_t;
214 
215  template <typename Iterator>
216  static Iterator advance(Iterator iter_begin, Iterator iter_end, size_type count)
217  {
218  auto iter = iter_begin;
219  storage_unit value = 0;
220 
221  while (iter != iter_end && count != 0) {
222 
223  value = *iter;
224  if (value < 0xDC00 || value > 0xDFFF) {
225  // not a low surrogate
226  --count;
227  }
228 
229  ++iter;
230  }
231 
232  if (value >= 0xD800 && value <= 0xDBFF) {
233  ++iter;
234  }
235 
236  return iter;
237  }
238 
239  template <typename Iterator>
240  static size_type distance(Iterator iter_begin, Iterator iter_end)
241  {
242  size_type retval = 0;
243 
244  for (auto iter = iter_begin; iter != iter_end; ++iter) {
245  storage_unit value = *iter;
246 
247  if (value < 0xDC00 || value > 0xDFFF) {
248  // not a low surrogate
249  ++retval;
250  }
251  }
252 
253  return retval;
254  }
255 
256  template <typename Container>
257  static typename Container::const_iterator insert(Container &str1,
258  typename Container::const_iterator iter, CsChar c, size_type count = 1)
259  {
260  uint32_t value = c.unicode();
261 
262  for (size_type x = 0; x < count; ++x) {
263 
264  if ((value <= 0xD7FF) || ((value >= 0xE000) && (value <= 0xFFFF))) {
265  iter = str1.insert(iter, value);
266 
267  } else {
268  value -= 0x010000;
269 
270  iter = str1.insert(iter, ((value ) & 0x03FF) + 0xDC00);
271  iter = str1.insert(iter, ((value >> 10) & 0x03FF) + 0xD800);
272  }
273 
274  }
275 
276  return iter;
277  }
278 
279  static size_type walk(size_type len, std::vector<storage_unit>::const_iterator iter)
280  {
281  size_type retval = 0;
282  size_type count = 0;
283 
284  if (len >= 0) {
285  // walk forward
286 
287  for (size_type x = 0; x < len; ++x) {
288  uint16_t value = *iter;
289 
290  count = numOfBytes(value);
291  iter += count;
292 
293  retval += count;
294  }
295 
296  } else {
297  // walk backwards
298 
299  for (size_type x = 0; x > len; --x) {
300 
301  while (true) {
302  --iter;
303  --retval;
304 
305  uint16_t value = *iter;
306 
307  if ((value & 0xFC00) != 0xDC00) {
308  // at the beginning of a char
309  break;
310  }
311  }
312 
313  // inside of the for loop
314  }
315  }
316 
317  return retval;
318  }
319 
320  static CsChar getCodePoint(std::vector<storage_unit>::const_iterator iter)
321  {
322  char32_t value = 0;
323  uint16_t tmp = *iter;
324 
325  if ((tmp & 0xFC00) != 0xD800) {
326  value = tmp;
327 
328  } else {
329  value = (tmp & 0x03FF) << 10;
330 
331  tmp = iter[1];
332  value |= (tmp & 0x03FF);
333  value |= 0x010000;
334  }
335 
336  return CsChar(value);
337  }
338 
339  private:
340  static size_type numOfBytes(uint16_t value)
341  {
342  if ((value & 0xFC00) == 0xD800) {
343  return 2;
344  }
345 
346  return 1;
347  }
348 };
349 
350 }
351 
352 #endif
#define LIB_CS_STRING_EXPORT
Definition: cs_char.h:34