CsString  2.0.1
cs_encoding.h
1 /***********************************************************************
2 *
3 * Copyright (c) 2017-2026 Barbara Geller
4 * Copyright (c) 2017-2026 Ansel Sermersheim
5 *
6 * This file is part of CsString.
7 *
8 * CsString is free software which is released under the BSD 2-Clause license.
9 * For license details refer to the LICENSE provided with this project.
10 *
11 * CsString is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 *
15 * https://opensource.org/licenses/BSD-2-Clause
16 *
17 ***********************************************************************/
18 
19 #ifndef LIB_CS_ENCODING_H
20 #define LIB_CS_ENCODING_H
21 
22 #include <cs_char.h>
23 
24 #include <stdint.h>
25 #include <vector>
26 
27 namespace CsString {
28 
29 template <typename Container, typename Value>
30 concept EncodingInsert = requires(Container r_str, Container::const_iterator r_iter, Value r_value)
31  { r_iter = r_str.insert(r_iter, r_value); };
32 
33 class LIB_CS_STRING_EXPORT utf8
34 {
35  public:
36  using size_type = std::ptrdiff_t;
37  using storage_unit = uint8_t;
38 
39  template <typename Iterator>
40  static Iterator advance(Iterator iter_begin, Iterator iter_end, size_type count)
41  {
42  auto iter = iter_begin;
43  storage_unit value = 0;
44 
45  while (iter != iter_end && count != 0) {
46 
47  value = *iter;
48  if (value < 0x80 || value > 0xBf) {
49  --count;
50  }
51 
52  ++iter;
53  }
54 
55  if (value >= 0xBF) {
56  while (iter != iter_end && *iter >= 0x80 && *iter <= 0xBF) {
57  ++iter;
58  }
59  }
60 
61  return iter;
62  }
63 
64  template <typename Iterator>
65  static size_type distance(Iterator iter_begin, Iterator iter_end)
66  {
67  size_type retval = 0;
68 
69  for (auto iter = iter_begin; iter != iter_end; ++iter) {
70  storage_unit value = *iter;
71 
72  if (value < 0x80 || value > 0xBF) {
73  // ascii or first byte of a multi byte sequence
74  ++retval;
75  }
76  }
77 
78  return retval;
79  }
80 
81  template <typename Container>
82  requires EncodingInsert<Container, uint32_t>
83  static typename Container::const_iterator insert(Container &str1,
84  typename Container::const_iterator iter, CsChar c, size_type count = 1)
85  {
86  uint32_t value = c.unicode();
87 
88  for (size_type x = 0; x < count; ++x) {
89  if (value <= 0x007F) {
90  iter = str1.insert(iter, value);
91 
92  } else if (value <= 0x07FF) {
93  iter = str1.insert(iter, ((value) & 0x3F) | 0x80);
94  iter = str1.insert(iter, ((value >> 6) & 0x1F) | 0xC0);
95 
96  } else if (value <= 0xFFFF) {
97  iter = str1.insert(iter, ((value ) & 0x3F) | 0x80);
98  iter = str1.insert(iter, ((value >> 6 ) & 0x3F) | 0x80);
99  iter = str1.insert(iter, ((value >> 12) & 0x0F) | 0xE0);
100 
101  } else {
102  iter = str1.insert(iter, ((value ) & 0x3F) | 0x80);
103  iter = str1.insert(iter, ((value >> 6 ) & 0x3F) | 0x80);
104  iter = str1.insert(iter, ((value >> 12) & 0x3F) | 0x80);
105  iter = str1.insert(iter, ((value >> 18) & 0x07) | 0xF0);
106 
107  }
108  }
109 
110  return iter;
111  }
112 
113  static size_type walk(size_type len, std::vector<storage_unit>::const_iterator iter)
114  {
115  size_type retval = 0;
116  size_type count = 0;
117 
118  if (len >= 0) {
119  // walk forward
120 
121  for (size_type x = 0; x < len; ++x) {
122  uint8_t value = *iter;
123 
124  count = numOfBytes(value);
125  iter += count;
126 
127  retval += count;
128  }
129 
130  } else {
131  // walk backwards
132 
133  for (size_type x = 0; x > len; --x) {
134 
135  while (true) {
136  --iter;
137  --retval;
138 
139  uint8_t value = *iter;
140 
141  if ((value & 0xC0) != 0x80) {
142  // at the beginning of a char
143  break;
144  }
145  }
146  }
147  }
148 
149  return retval;
150  }
151 
152  static CsChar getCodePoint(std::vector<storage_unit>::const_iterator iter)
153  {
154  char32_t value = 0;
155  uint8_t tmp = *iter;
156 
157  if ((tmp & 0x80) == 0) {
158  value = tmp;
159 
160  } else if ((tmp & 0xE0) == 0xC0) {
161  value = (tmp & 0x1F) << 6;
162 
163  tmp = iter[1];
164  value |= (tmp & 0x3F);
165 
166 
167  } else if ((tmp & 0xF0) == 0xE0) {
168  value = (tmp & 0x0F) << 12;
169 
170  tmp = iter[1];
171  value |= (tmp & 0x3F) << 6;
172 
173  tmp = iter[2];
174  value |= (tmp & 0x3F);
175 
176  } else {
177  value = (tmp & 0x07) << 18;
178 
179  tmp = iter[1];
180  value |= (tmp & 0x3F) << 12;
181 
182  tmp = iter[2];
183  value |= (tmp & 0x3F) << 6;
184 
185  tmp = iter[3];
186  value |= (tmp & 0x3F);
187 
188  }
189 
190  return CsChar(value);
191  }
192 
193  private:
194  static size_type numOfBytes(uint8_t value)
195  {
196  if ((value & 0x80) == 0) {
197  return 1;
198 
199  } else if ((value & 0xE0) == 0xC0) {
200  return 2;
201 
202  } else if ((value & 0xF0) == 0xE0) {
203  return 3;
204 
205  } else if ((value & 0xF8) == 0xF0) {
206  return 4;
207 
208  }
209 
210  return 1;
211  }
212 };
213 
214 class LIB_CS_STRING_EXPORT utf16
215 {
216  public:
217  using size_type = std::ptrdiff_t;
218  using storage_unit = uint16_t;
219 
220  template <typename Iterator>
221  static Iterator advance(Iterator iter_begin, Iterator iter_end, size_type count)
222  {
223  auto iter = iter_begin;
224  storage_unit value = 0;
225 
226  while (iter != iter_end && count != 0) {
227 
228  value = *iter;
229  if (value < 0xDC00 || value > 0xDFFF) {
230  // not a low surrogate
231  --count;
232  }
233 
234  ++iter;
235  }
236 
237  if (value >= 0xD800 && value <= 0xDBFF) {
238  ++iter;
239  }
240 
241  return iter;
242  }
243 
244  template <typename Iterator>
245  static size_type distance(Iterator iter_begin, Iterator iter_end)
246  {
247  size_type retval = 0;
248 
249  for (auto iter = iter_begin; iter != iter_end; ++iter) {
250  storage_unit value = *iter;
251 
252  if (value < 0xDC00 || value > 0xDFFF) {
253  // not a low surrogate
254  ++retval;
255  }
256  }
257 
258  return retval;
259  }
260 
261  template <typename Container>
262  static typename Container::const_iterator insert(Container &str1,
263  typename Container::const_iterator iter, CsChar c, size_type count = 1)
264  {
265  uint32_t value = c.unicode();
266 
267  for (size_type x = 0; x < count; ++x) {
268 
269  if ((value <= 0xD7FF) || ((value >= 0xE000) && (value <= 0xFFFF))) {
270  iter = str1.insert(iter, value);
271 
272  } else {
273  value -= 0x010000;
274 
275  iter = str1.insert(iter, ((value ) & 0x03FF) + 0xDC00);
276  iter = str1.insert(iter, ((value >> 10) & 0x03FF) + 0xD800);
277  }
278 
279  }
280 
281  return iter;
282  }
283 
284  static size_type walk(size_type len, std::vector<storage_unit>::const_iterator iter)
285  {
286  size_type retval = 0;
287  size_type count = 0;
288 
289  if (len >= 0) {
290  // walk forward
291 
292  for (size_type x = 0; x < len; ++x) {
293  uint16_t value = *iter;
294 
295  count = numOfBytes(value);
296  iter += count;
297 
298  retval += count;
299  }
300 
301  } else {
302  // walk backwards
303 
304  for (size_type x = 0; x > len; --x) {
305 
306  while (true) {
307  --iter;
308  --retval;
309 
310  uint16_t value = *iter;
311 
312  if ((value & 0xFC00) != 0xDC00) {
313  // at the beginning of a char
314  break;
315  }
316  }
317 
318  // inside of the for loop
319  }
320  }
321 
322  return retval;
323  }
324 
325  static CsChar getCodePoint(std::vector<storage_unit>::const_iterator iter)
326  {
327  char32_t value = 0;
328  uint16_t tmp = *iter;
329 
330  if ((tmp & 0xFC00) != 0xD800) {
331  value = tmp;
332 
333  } else {
334  value = (tmp & 0x03FF) << 10;
335 
336  tmp = iter[1];
337  value |= (tmp & 0x03FF);
338  value |= 0x010000;
339  }
340 
341  return CsChar(value);
342  }
343 
344  private:
345  static size_type numOfBytes(uint16_t value)
346  {
347  if ((value & 0xFC00) == 0xD800) {
348  return 2;
349  }
350 
351  return 1;
352  }
353 };
354 
355 }
356 
357 #endif
#define LIB_CS_STRING_EXPORT
Definition: cs_char.h:35