Notcurses 3.0.13
a blingful library for TUIs and character graphics
Loading...
Searching...
No Matches
egcpool.h
Go to the documentation of this file.
1#ifndef NOTCURSES_EGCPOOL
2#define NOTCURSES_EGCPOOL
3
4#include <wchar.h>
5#include <errno.h>
6#include <stdio.h>
7#include <wctype.h>
8#include <stddef.h>
9#include <assert.h>
10#include <stdlib.h>
11#include <string.h>
12#include <stdbool.h>
13#include <unigbrk.h>
14#include <unictype.h>
15#include "notcurses/notcurses.h"
16#include "compat/compat.h"
17#include "logging.h"
18
19#ifdef __cplusplus
20extern "C" {
21#endif
22
// an nccell only provides storage for up to 4 bytes of an EGC. if there's
// anything more than that, it's spilled into the egcpool, and the nccell
// records the offset. when an nccell is released, the egcpool memory it
// owned is zeroed out, and made usable by another nccell.
//
// Stored EGCs are NUL-terminated, and unused bytes are kept at zero, so a
// run of zero bytes marks free space within the pool.
typedef struct egcpool {
  char* pool;     // ringbuffer of attached extension storage
  int poolsize;   // total number of bytes in pool
  int poolused;   // bytes actively used, grow when this gets too large
  int poolwrite;  // next place to *look for* a place to write
} egcpool;

// Smallest size the pool is ever grown to.
#define POOL_MINIMUM_ALLOC BUFSIZ
// Largest size the pool may reach: 16MiB (assumes int is at least 32 bits).
#define POOL_MAXIMUM_BYTES (1 << 24)
37
38static inline void
39egcpool_init(egcpool* p){
40 memset(p, 0, sizeof(*p));
41}
42
43static inline int
44egcpool_grow(egcpool* pool, int len){
45 int newsize = pool->poolsize * 2;
46 if(newsize < pool->poolsize){
47 return -1; // pernicious overflow (see also POOL_MAXIMUM_BYTES check below)
48 }
49 if(newsize < POOL_MINIMUM_ALLOC){
50 newsize = POOL_MINIMUM_ALLOC;
51 }
52 while(len > newsize - pool->poolsize){ // ensure we make enough space
53 if(newsize * 2 < newsize){
54 return -1;
55 }
56 newsize *= 2;
57 }
58 if(newsize > POOL_MAXIMUM_BYTES){
59 return -1;
60 }
61 // nasty cast here because c++ source might include this header :/
62 char* tmp = (char*)realloc(pool->pool, newsize);
63 if(tmp == NULL){
64 return -1;
65 }
66 pool->pool = tmp;
67 memset(pool->pool + pool->poolsize, 0, newsize - pool->poolsize);
68 pool->poolsize = newsize;
69 return 0;
70}
71
// get the expected length of the encoded codepoint from the first byte of a
// utf-8 character. if the byte is illegal as a first byte, 1 is returned.
// Table 3.1B, Legal UTF8 Byte Sequences, Corrigendum #1: UTF-8 Shortest Form.
// subsequent ("continuation") bytes must start with the bit pattern 10.
static inline size_t
utf8_codepoint_length(unsigned char c){
  if(c <= 0x7f){        // 0x000000...0x00007f (ASCII)
    return 1;
  }else if(c <= 0xc1){  // continuation byte, or overlong 2-byte lead: illegal
    return 1;
  }else if(c <= 0xdf){  // 0x000080...0x0007ff
    return 2;
  }else if(c <= 0xef){  // 0x000800...0x00ffff
    return 3;
  }else if(c <= 0xf4){  // 0x010000...0x10ffff
    return 4;
  }else{                // 0xf5 and above: illegal first byte
    return 1;
  }
}
92
// Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
// libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
// columns to '*colcount'. Returns the number of bytes consumed, not including
// any NUL terminator, or -1 on invalid UTF-8 or a prohibited control
// character. Neither the number of bytes nor columns is necessarily equal to
// the number of decoded code points. Such are the ways of Unicode.
// uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t
// to use wcwidth() anyway FIXME except this doesn't work with 16-bit wchar_t!
static inline int
utf8_egc_len(const char* gcluster, int* colcount){
  size_t ret = 0;
  *colcount = 0;
  size_t r;
  mbstate_t mbt;
  memset(&mbt, 0, sizeof(mbt));
  wchar_t wc, prevw = 0;
  bool injoin = false;
  do{
    r = mbrtowc(&wc, gcluster, MB_LEN_MAX, &mbt);
    // mbrtowc() reports failure as (size_t)-1 (invalid sequence) or
    // (size_t)-2 (incomplete sequence); test for those explicitly rather
    // than relying on narrowing to a signed type.
    if(r == (size_t)-1 || r == (size_t)-2){
      // FIXME probably ought escape this somehow
      logerror("invalid UTF8: %s", gcluster);
      return -1;
    }
    if(prevw && !injoin && uc_is_grapheme_break(prevw, wc)){
      break; // starts a new EGC, exit and do not claim
    }
    int cols;
    if(uc_is_property_variation_selector(wc)){ // ends EGC
      ret += r;
      break;
    }else if(wc == L'\u200d' || injoin){ // ZWJ is iswcntrl, so check it first
      injoin = true;
      cols = 0;
    }else{
      cols = wcwidth(wc);
      if(cols < 0){
        injoin = false;
        if(iswspace(wc)){ // newline or tab
          *colcount = 1;
          // claim every byte of the whitespace character, which needn't be
          // a single byte (e.g. U+0085 encodes as two)
          return (int)(ret + r);
        }
        cols = 1;
        if(iswcntrl(wc)){
          logerror("prohibited or invalid unicode: 0x%08x", (unsigned)wc);
          return -1;
        }
      }
    }
    // only the first character to contribute a width sets the column count
    if(*colcount == 0){
      *colcount += cols;
    }
    ret += r;
    gcluster += r;
    // prevw latches the first decoded character, and is not updated after
    if(!prevw){
      prevw = wc;
    }
  }while(r); // r == 0 means we decoded the NUL terminator
  // FIXME what if injoin is set? incomplete EGC!
  return (int)ret;
}
153
154// if we're inserting a EGC of |len| bytes, ought we proactively realloc?
155static inline bool
156egcpool_alloc_justified(const egcpool* pool, int len){
157 const int poolfree = pool->poolsize - pool->poolused;
158 // proactively get more space if we have less than 10% free. this doesn't
159 // guarantee that we'll have enough space to insert the string -- we could
160 // theoretically have every 10th byte free, and be unable to write even a
161 // two-byte egc -- so we might have to allocate after an expensive search :/.
162 if(poolfree >= len && poolfree * 10 > pool->poolsize){
163 return false;
164 }
165 return true;
166}
167
168// stash away the provided UTF8, NUL-terminated grapheme cluster. the cluster
169// should not be less than 2 bytes (such a cluster should be directly stored in
170// the cell). returns -1 on error, and otherwise a non-negative offset. 'ulen'
171// must be the number of bytes to lift from egc (utf8_egc_len()).
172__attribute__ ((nonnull (1, 2))) static inline int
173egcpool_stash(egcpool* pool, const char* egc, size_t ulen){
174 int len = ulen + 1; // count the NUL terminator
175 if(len <= 2){ // should never be empty, nor a single byte + NUL
176 return -1;
177 }
178 // the first time through, we don't force a grow unless we expect ourselves
179 // to have too little space. once we've done a search, we do force the grow.
180 // we should thus never have more than two iterations of this loop.
181 bool searched = false;
182 // we might have to realloc our underlying pool. it is possible that this EGC
183 // is actually *in* that pool, in which case our pointer will be invalidated.
184 // to be safe, duplicate prior to a realloc, and free along all paths.
186 do{
187 if(egcpool_alloc_justified(pool, len) || searched){
188 if(!duplicated){
189 // cast (and avoidance of strndup) to facilitate c++ inclusions
190 if((duplicated = (char *)malloc(ulen + 1)) == NULL){
191 return -1;
192 }
193 memcpy(duplicated, egc, ulen);
194 duplicated[ulen] = '\0';
195 }
196 if(egcpool_grow(pool, len) && searched){
198 return -1;
199 }
200 egc = duplicated;
201 }
202 // we now look for a place to lay out this egc. we need |len| zeroes in a
203 // row. starting at pool->poolwrite, look for such a range of unused
204 // memory. if we find it, write it out, and update used count. if we come
205 // back to where we started, force a growth and try again.
206 int curpos = pool->poolwrite;
207//fprintf(stderr, "Stashing [%s] %d starting at %d\n", egc, len, curpos);
208 do{
209 if(curpos == pool->poolsize){
210 curpos = 0;
211 }
212 if(pool->pool[curpos]){ // can't write if there's stuff here
213 ++curpos;
214 }else if(curpos && pool->pool[curpos - 1]){ // don't kill someone's NUL
215 ++curpos;
216 }else if(pool->poolsize - curpos < len){ // can't wrap around
217 if(pool->poolwrite > curpos){
218 break;
219 }
220 curpos = 0; // can this skip pool->poolwrite?
221 }else{ // promising! let's see if there's enough space
222 int need = len;
223 size_t trial = curpos;
224 while(--need){
225 if(pool->pool[++trial]){ // alas, not enough space here
226 break;
227 }
228 }
229 if(need == 0){ // found a suitable space, copy it!
230 memcpy(pool->pool + curpos, egc, len - 1);
231 pool->pool[curpos + len - 1] = '\0';
232 pool->poolwrite = curpos + len;
233 pool->poolused += len;
235//fprintf(stderr, "Stashing AT %d\n", curpos);
236 return curpos;
237 }
238 if(pool->poolwrite > curpos && pool->poolwrite - (len - need) < curpos){
239 break;
240 }
241 curpos += len - need;
242 }
243 }while(curpos != pool->poolwrite);
244 }while( (searched = !searched) );
246 assert(false);
247 return -1; // should never get here
248}
249
250// remove the egc from the pool. start at offset, and zero out everything until
251// we find a zero (our own NUL terminator). remove that number of bytes from
252// the usedcount.
253static inline void
254egcpool_release(egcpool* pool, int offset){
255 size_t freed = 1; // account for free(d) NUL terminator
256 while(pool->pool[offset]){
257 pool->pool[offset] = '\0';
258 ++freed;
259 ++offset;
260 assert(offset < pool->poolsize);
261 }
262 pool->poolused -= freed;
263 // FIXME ought we update pool->poolwrite?
264}
265
266static inline void
267egcpool_dump(egcpool* pool){
268 free(pool->pool);
269 pool->pool = NULL;
270 pool->poolsize = 0;
271 pool->poolwrite = 0;
272 pool->poolused = 0;
273}
274
275// get the offset into the egcpool for this cell's EGC. returns meaningless and
276// unsafe results if called on a simple cell.
277static inline uint32_t
278cell_egc_idx(const nccell* c){
279 return (htole(c->gcluster) & 0x00fffffflu);
280}
281
282// Is the cell a spilled (more than 4 byte) UTF8 EGC?
283static inline bool
284cell_extended_p(const nccell* c){
285 return (htole(c->gcluster) & 0xff000000ul) == 0x01000000ul;
286}
287
288// Is the cell simple (a UTF8-encoded EGC of four bytes or fewer)?
289static inline bool
290cell_simple_p(const nccell* c){
291 return !cell_extended_p(c);
292}
293
294// only applies to complex cells, do not use on simple cells
295__attribute__ ((__returns_nonnull__)) static inline const char*
296egcpool_extended_gcluster(const egcpool* pool, const nccell* c) {
297 assert(cell_extended_p(c));
298 uint32_t idx = cell_egc_idx(c);
299 return pool->pool + idx;
300}
301
302// Duplicate the contents of EGCpool 'src' onto another, wiping out any prior
303// contents in 'dst'.
304static inline int
305egcpool_dup(egcpool* dst, const egcpool* src){
306 if(src->pool){
307 char* tmp;
308 if((tmp = (char*)realloc(dst->pool, src->poolsize)) == NULL){
309 return -1;
310 }
311 dst->pool = tmp;
312 memcpy(dst->pool, src->pool, src->poolsize);
313 }
314 dst->poolsize = src->poolsize;
315 dst->poolused = src->poolused;
316 dst->poolwrite = src->poolwrite;
317 return 0;
318}
319
320#ifdef __cplusplus
321}
322#endif
323
324#endif
#define POOL_MAXIMUM_BYTES
Definition egcpool.h:36
const char * egc
Definition egcpool.h:173
const char size_t ulen
Definition egcpool.h:173
assert(false)
const nccell * c
Definition egcpool.h:296
bool searched
Definition egcpool.h:181
uint32_t idx
Definition egcpool.h:298
free(duplicated)
char * duplicated
Definition egcpool.h:185
#define POOL_MINIMUM_ALLOC
Definition egcpool.h:35
__attribute__((nonnull(1, 2))) static inline int egcpool_stash(egcpool *pool
int r
Definition fbuf.h:226
#define logerror(fmt,...)
Definition logging.h:32
#define htole(x)
Definition ncport.h:36
API int API int const nccell unsigned len
Definition notcurses.h:2588
char * pool
Definition egcpool.h:29
int poolsize
Definition egcpool.h:30
int poolused
Definition egcpool.h:31
int poolwrite
Definition egcpool.h:32
uint32_t gcluster
Definition notcurses.h:693
return NULL
Definition termdesc.h:229