From 3ab7fe0812d0e63ab9c5c5ade3553b1bf3f4b236 Mon Sep 17 00:00:00 2001 From: "Filipe Oliveira (Redis)" Date: Wed, 13 May 2026 12:38:08 +0100 Subject: [PATCH 01/19] =?UTF-8?q?fast=5Ffloat=5Fstrtod:=20fix=20=C2=B11=20?= =?UTF-8?q?ULP=20rounding=20mismatch=20in=20widened=20fast=20path=20(#1511?= =?UTF-8?q?1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Fixes a ±1 ULP rounding mismatch between `fast_float_strtod()` and libc `strtod()` in the widened (mantissa > 2^53) fast path introduced by #15061. Reported by @vitahlin in https://github.com/redis/redis/pull/14661#issuecomment-4320058616 with two minimal reproducers: ``` input: 9007199255094284e-19 fast_float_strtod: 0x3f4d83c94fbbcb8a libc strtod : 0x3f4d83c94fbbcb8b delta -1 ULP input: 2489830482329185244e1 fast_float_strtod: 0x43f59888c51e5b4c libc strtod : 0x43f59888c51e5b4b delta +1 ULP ``` Redis treats `strtod()` as the fallback for `fast_float_strtod()`, so every fast-path-accepted input is contractually expected to be bit-exact with `strtod()`. The two cases above are accepted by the widened branch but produce a different IEEE-754 representation, breaking the contract. ## Root cause The widened branch added in #15061 used a homebrew shortcut to convert a 128-bit integer product to a double: ```c value = (double)hi * 18446744073709551616.0 + (double)lo; ``` This is **not** a single-rounding operation — `(double)hi` rounds when `hi > 2^53`, and the subsequent `+ (double)lo` rounds again. For inputs near the round-half-to-even boundary, the two roundings can compose into the wrong direction. The negative-exponent branch is doubly affected: integer division `scaled / divisor` truncates the remainder before the conversion, so even a hypothetical correct `hi*2^64+lo` step would round down on inputs that should round up. ## Fix Replace the homebrew widened branch with the **Eisel-Lemire** algorithm from upstream `fast_float`. 
This is the same algorithm that `fast_float`'s own widened path uses; its bit-exact agreement with `strtod()` for inputs with a ≤19-digit mantissa is proved in: > Noble Mushtak and Daniel Lemire, "Fast Number Parsing Without Fallback". Pieces ported (MIT-licensed, from `fast_float/include/fast_float/fast_table.h`, `decimal_to_binary.h`, `float_common.h`): - 128-bit precomputed extended-precision powers of five (`5^-342 ... 5^308`, 651 entries) — pure data. - `compute_product_approximation` — 128-bit multiply with high-half rounding-boundary fixup. - `compute_float` — the main algorithm; returns `(mantissa, power2)` ready to be packed. - `am_to_double` — IEEE-754 binary64 bit-pack. - `__builtin_clzll` and `__uint128_t` wrappers (the 64x64 -> 128 multiply falls back to a portable version built from 32-bit halves when `__uint128_t` is unavailable). Clinger's strict fast path (`mantissa ≤ 2^53` and `|exp| ≤ 22`) is **kept** unchanged — it's a single double multiply/divide and is faster than Eisel-Lemire on its domain. Only the buggy widened branch is replaced. The port stays minimal: - **double-only** (no `float`, no `long double`) - No bigint slow path. The rare "indeterminate" inputs that upstream resolves with `digit_comp` are unreachable from `parse_number_string`'s ≤19-digit mantissa per the Mushtak-Lemire proof, but a defensive `am.power2 < 0` check is preserved that falls back to libc `strtod()` if any future caller widens the input domain. ## Why not just revert #15061? A revert was considered. Reverting restores correctness at the cost of the +73-84% zset listpack-load wins #15061 measured on 17-19 digit double scores. Eisel-Lemire is *the* algorithm that gives both correctness *and* the wider mantissa range — preserving #15061's wins while fixing the rounding regression. A "tightened admission filter" (only accept widened-path inputs where the conversion happens to be single-rounding) was also considered. 
The math shows the filter conditions are essentially unsatisfied for typical inputs (`lo == 0` requires the 128-bit product be divisible by 2^64; only ~1 in 10^13 random inputs qualify), making it equivalent to a revert with extra dead code. Eisel-Lemire is the only widened-path solution that preserves perf on the typical case. --- src/fast_float_strtod.c | 526 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 477 insertions(+), 49 deletions(-) diff --git a/src/fast_float_strtod.c b/src/fast_float_strtod.c index 25bddba79..8039c5a9b 100644 --- a/src/fast_float_strtod.c +++ b/src/fast_float_strtod.c @@ -48,6 +48,195 @@ static const double powers_of_ten[] = { 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22 }; +/* ---------------------------------------------------------------------------- + * Eisel-Lemire algorithm — extended-precision powers of five. + * + * The table below maps from decimal scaling (10^q) to a 128-bit binary + * approximation. Since 10^q = 2^q * 5^q and the 2^q factor is exact in + * binary, only 5^q affects the binary significand — so we precompute + * 5^q rounded toward 1 to 128 bits. Used by `compute_float()` to avoid + * any iterative rounding in the widened (mantissa > 2^53) range. + * + * Pulled verbatim from fast_float by Daniel Lemire & Joao Paulo Magalhaes + * (MIT-licensed, https://github.com/fastfloat/fast_float — fast_table.h). + * + * Range: 5^-342 ... 5^308 — covers every value that can produce a finite + * non-zero double from a 64-bit decimal mantissa. 651 entries, each stored + * as { high64, low64 } pairs (1302 uint64_t total). 
+ * ---------------------------------------------------------------------------- */ + +#define EISEL_LEMIRE_SMALLEST_POWER_OF_FIVE -342 +#define EISEL_LEMIRE_LARGEST_POWER_OF_FIVE 308 +#define EISEL_LEMIRE_NUMBER_OF_ENTRIES (2 * (EISEL_LEMIRE_LARGEST_POWER_OF_FIVE - \ + EISEL_LEMIRE_SMALLEST_POWER_OF_FIVE + 1)) + +static const uint64_t power_of_five_128[EISEL_LEMIRE_NUMBER_OF_ENTRIES] = { + 0xeef453d6923bd65a, 0x113faa2906a13b3f, 0x9558b4661b6565f8, 0x4ac7ca59a424c507, 0xbaaee17fa23ebf76, 0x5d79bcf00d2df649, 0xe95a99df8ace6f53, 0xf4d82c2c107973dc, + 0x91d8a02bb6c10594, 0x79071b9b8a4be869, 0xb64ec836a47146f9, 0x9748e2826cdee284, 0xe3e27a444d8d98b7, 0xfd1b1b2308169b25, 0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f, 0xbdbd2d335e51a935, 0xde8b2b66b3bc4723, 0xad2c788035e61382, 0x8b16fb203055ac76, 0x4c3bcb5021afcc31, 0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78, 0xd71d6dad34a2af0d, 0x87d4713d6f33aa6b, 0x8672648c40e5ad68, 0xa9c98d8ccb009506, 0x680efdaf511f18c2, 0xd43bf0effdc0ba48, 0x212bd1b2566def2, + 0x84a57695fe98746d, 0x14bb630f7604b57, 0xa5ced43b7e3e9188, 0x419ea3bd35385e2d, 0xcf42894a5dce35ea, 0x52064cac828675b9, 0x818995ce7aa0e1b2, 0x7343efebd1940993, + 0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8, 0xca66fa129f9b60a6, 0xd41a26e077774ef6, 0xfd00b897478238d0, 0x8920b098955522b4, 0x9e20735e8cb16382, 0x55b46e5f5d5535b0, + 0xc5a890362fddbc62, 0xeb2189f734aa831d, 0xf712b443bbd52b7b, 0xa5e9ec7501d523e4, 0x9a6bb0aa55653b2d, 0x47b233c92125366e, 0xc1069cd4eabe89f8, 0x999ec0bb696e840a, + 0xf148440a256e2c76, 0xc00670ea43ca250d, 0x96cd2a865764dbca, 0x380406926a5e5728, 0xbc807527ed3e12bc, 0xc605083704f5ecf2, 0xeba09271e88d976b, 0xf7864a44c633682e, + 0x93445b8731587ea3, 0x7ab3ee6afbe0211d, 0xb8157268fdae9e4c, 0x5960ea05bad82964, 0xe61acf033d1a45df, 0x6fb92487298e33bd, 0x8fd0c16206306bab, 0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696, 0x8f48a4899877186c, 0xe0b62e2929aba83c, 0x331acdabfe94de87, 0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14, 0xaf8e5410288e1b6f, 
0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a, 0xc9e82cd9f69d6150, 0x892731ac9faf056e, 0xbe311c083a225cd2, 0xab70fe17c79ac6ca, 0x6dbd630a48aaf406, 0xd64d3d9db981787d, 0x92cbbccdad5b108, + 0x85f0468293f0eb4e, 0x25bbf56008c58ea5, 0xa76c582338ed2621, 0xaf2af2b80af6f24e, 0xd1476e2c07286faa, 0x1af5af660db4aee1, 0x82cca4db847945ca, 0x50d98d9fc890ed4d, + 0xa37fce126597973c, 0xe50ff107bab528a0, 0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8, 0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a, 0x9faacf3df73609b1, 0x77b191618c54e9ac, + 0xc795830d75038c1d, 0xd59df5b9ef6a2417, 0xf97ae3d0d2446f25, 0x4b0573286b44ad1d, 0x9becce62836ac577, 0x4ee367f9430aec32, 0xc2e801fb244576d5, 0x229c41f793cda73f, + 0xf3a20279ed56d48a, 0x6b43527578c1110f, 0x9845418c345644d6, 0x830a13896b78aaa9, 0xbe5691ef416bd60c, 0x23cc986bc656d553, 0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9, 0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53, 0xe858ad248f5c22c9, 0xd1b3400f8f9cff68, 0x91376c36d99995be, 0x23100809b9c21fa1, + 0xb58547448ffffb2d, 0xabd40a0c2832a78a, 0xe2e69915b3fff9f9, 0x16c90c8f323f516c, 0x8dd01fad907ffc3b, 0xae3da7d97f6792e3, 0xb1442798f49ffb4a, 0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d, 0x40405643d711d583, 0x8a7d3eef7f1cfc52, 0x482835ea666b2572, 0xad1c8eab5ee43b66, 0xda3243650005eecf, 0xd863b256369d4a40, 0x90bed43e40076a82, + 0x873e4f75e2224e68, 0x5a7744a6e804a291, 0xa90de3535aaae202, 0x711515d0a205cb36, 0xd3515c2831559a83, 0xd5a5b44ca873e03, 0x8412d9991ed58091, 0xe858790afe9486c2, + 0xa5178fff668ae0b6, 0x626e974dbe39a872, 0xce5d73ff402d98e3, 0xfb0a3d212dc8128f, 0x80fa687f881c7f8e, 0x7ce66634bc9d0b99, 0xa139029f6a239f72, 0x1c1fffc1ebc44e80, + 0xc987434744ac874e, 0xa327ffb266b56220, 0xfbe9141915d7a922, 0x4bf1ff9f0062baa8, 0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9, 0xc4ce17b399107c22, 0xcb550fb4384d21d3, + 0xf6019da07f549b2b, 0x7e2a53a146606a48, 0x99c102844f94e0fb, 0x2eda7444cbfc426d, 0xc0314325637a1939, 0xfa911155fefb5308, 0xf03d93eebc589f88, 0x793555ab7eba27ca, + 0x96267c7535b763b5, 
0x4bc1558b2f3458de, 0xbbb01b9283253ca2, 0x9eb1aaedfb016f16, 0xea9c227723ee8bcb, 0x465e15a979c1cadc, 0x92a1958a7675175f, 0xbfacd89ec191ec9, + 0xb749faed14125d36, 0xcef980ec671f667b, 0xe51c79a85916f484, 0x82b7e12780e7401a, 0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810, 0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9, 0x67a791e093e1d49a, 0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0, 0xaecc49914078536d, 0x58fae9f773886e18, 0xda7f5bf590966848, 0xaf39a475506a899e, + 0x888f99797a5e012d, 0x6d8406c952429603, 0xaab37fd7d8f58178, 0xc8e5087ba6d33b83, 0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64, 0x855c3be0a17fcd26, 0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e, 0xd0601d8efc57b08b, 0xf13b94daf124da26, 0x823c12795db6ce57, 0x76c53d08d6b70858, 0xa2cb1717b52481ed, 0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09, 0xfe5d54150b090b02, 0xd3f93b35435d7c4c, 0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf, 0xc6b8e9b0709f109a, 0x359ab6419ca1091b, + 0xf867241c8cc6d4c0, 0xc30163d203c94b62, 0x9b407691d7fc44f8, 0x79e0de63425dcf1d, 0xc21094364dfb5636, 0x985915fc12f542e4, 0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a, 0xa705992ceecf9c42, 0xbd8430bd08277231, 0x50c6ff782a838353, 0xece53cec4a314ebd, 0xa4f8bf5635246428, 0x940f4613ae5ed136, 0x871b7795e136be99, + 0xb913179899f68584, 0x28e2557b59846e3f, 0xe757dd7ec07426e5, 0x331aeada2fe589cf, 0x9096ea6f3848984f, 0x3ff0d2c85def7621, 0xb4bca50b065abe63, 0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb, 0xd3e8495912c62894, 0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c, 0xb080392cc4349dec, 0xbd8d794d96aacfb3, 0xdca04777f541c567, 0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60, 0xf41686c49db57244, 0xac5d37d5b79b6239, 0x311c2875c522ced5, 0xd77485cb25823ac7, 0x7d633293366b828b, 0x86a8d39ef77164bc, 0xae5dff9c02033197, + 0xa8530886b54dbdeb, 0xd9f57f830283fdfc, 0xd267caa862a12d66, 0xd072df63c324fd7b, 0x8380dea93da4bc60, 0x4247cb9e59f71e6d, 0xa46116538d0deb78, 0x52d9be85f074e608, + 0xcd795be870516656, 0x67902e276c921f8b, 0x806bd9714632dff6, 0xba1cd8a3db53b6, 
0xa086cfcd97bf97f3, 0x80e8a40eccd228a4, 0xc8a883c0fdaf7df0, 0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c, 0x796b805720085f81, 0x9cc3a6eec6311a63, 0xcbe3303674053bb0, 0xc3f490aa77bd60fc, 0xbedbfc4411068a9c, 0xf4f1b4d515acb93b, 0xee92fb5515482d44, + 0x991711052d8bf3c5, 0x751bdd152d4d1c4a, 0xbf5cd54678eef0b6, 0xd262d45a78a0635d, 0xef340a98172aace4, 0x86fb897116c87c34, 0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0, + 0xbae0a846d2195712, 0x8974836059cca109, 0xe998d258869facd7, 0x2bd1a438703fc94b, 0x91ff83775423cc06, 0x7b6306a34627ddcf, 0xb67f6455292cbf08, 0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93, 0x8e938662882af53e, 0x547eb47b7282ee9c, 0xb23867fb2a35b28d, 0xe99e619a4f23aa43, 0xdec681f9f4c31f31, 0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e, 0xde83bc408dd3dd04, 0xae0b158b4738705e, 0x9624ab50b148d445, 0xd98ddaee19068c76, 0x3badd624dd9b0957, 0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c, 0xd47487cc8470652b, 0x7647c3200069671f, 0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073, 0xa5fb0a17c777cf09, 0xf468107100525890, + 0xcf79cc9db955c2cc, 0x7182148d4066eeb4, 0x81ac1fe293d599bf, 0xc6f14cd848405530, 0xa21727db38cb002f, 0xb8ada00e5a506a7c, 0xca9cf1d206fdc03b, 0xa6d90811f0e4851c, + 0xfd442e4688bd304a, 0x908f4a166d1da663, 0x9e4a9cec15763e2e, 0x9a598e4e043287fe, 0xc5dd44271ad3cdba, 0x40eff1e1853f29fd, 0xf7549530e188c128, 0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9, 0x82bb74f8301958ce, 0xc13a148e3032d6e7, 0xe36a52363c1faf01, 0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1, 0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de, 0x7415d448f6b6f0e7, 0xebdf661791d60f56, 0x111b495b3464ad21, 0x936b9fcebb25c995, 0xcab10dd900beec34, 0xb84687c269ef3bfb, 0x3d5d514f40eea742, + 0xe65829b3046b0afa, 0xcb4a5a3112a5112, 0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab, 0xb3f4e093db73a093, 0x59ed216765690f56, 0xe0f218b8d25088b8, 0x306869c13ec3532c, + 0x8c974f7383725573, 0x1e414218c73a13fb, 0xafbd2350644eeacf, 0xe5d1929ef90898fa, 0xdbac6c247d62a583, 0xdf45f746b74abf39, 
0x894bc396ce5da772, 0x6b8bba8c328eb783, + 0xab9eb47c81f5114f, 0x66ea92f3f326564, 0xd686619ba27255a2, 0xc80a537b0efefebd, 0x8613fd0145877585, 0xbd06742ce95f5f36, 0xa798fc4196e952e7, 0x2c48113823b73704, + 0xd17f3b51fca3a7a0, 0xf75a15862ca504c5, 0x82ef85133de648c4, 0x9a984d73dbe722fb, 0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba, 0xcc963fee10b7d1b3, 0x318df905079926a8, + 0xffbbcfe994e5c61f, 0xfdf17746497f7052, 0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633, 0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0, 0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d, 0x6bea10ca65c084e, 0xc31bfa0fe5698db8, 0x486e494fcff30a62, 0xf3e2f893dec3f126, 0x5a89dba3c3efccfa, 0x986ddb5c6b3a76b7, 0xf89629465a75e01c, + 0xbe89523386091465, 0xf6bbb397f1135823, 0xee2ba6c0678b597f, 0x746aa07ded582e2c, 0x94db483840b717ef, 0xa8c2a44eb4571cdc, 0xba121a4650e4ddeb, 0x92f34d62616ce413, + 0xe896a0d7e51e1566, 0x77b020baf9c81d17, 0x915e2486ef32cd60, 0xace1474dc1d122e, 0xb5b5ada8aaff80b8, 0xd819992132456ba, 0xe3231912d5bf60e6, 0x10e1fff697ed6c69, + 0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1, 0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2, 0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde, 0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d, 0x86c16c98d2c953c6, 0xd89d64d57a607744, 0xe871c7bf077ba8b7, 0x87625f056c7c4a8b, 0x11471cd764ad4972, 0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf, + 0xd389b47879823479, 0x4aff1d108d4ec2c3, 0x843610cb4bf160cb, 0xcedf722a585139ba, 0xa54394fe1eedb8fe, 0xc2974eb4ee658828, 0xce947a3da6a9273e, 0x733d226229feea32, + 0x811ccc668829b887, 0x806357d5a3f525f, 0xa163ff802a3426a8, 0xca07c2dcb0cf26f7, 0xc9bcff6034c13052, 0xfc89b393dd02f0b5, 0xfc2c3f3841f17c67, 0xbbac2078d443ace2, + 0x9d9ba7832936edc0, 0xd54b944b84aa4c0d, 0xc5029163f384a931, 0xa9e795e65d4df11, 0xf64335bcf065d37d, 0x4d4617b5ff4a16d5, 0x99ea0196163fa42e, 0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6, 0xf07da27a82c37088, 0x5d767327bb4e5a4c, 0x964e858c91ba2655, 0x3a6a07f8d510f86f, 0xbbe226efb628afea, 0x890489f70a55368b, + 0xeadab0aba3b2dbe5, 
0x2b45ac74ccea842e, 0x92c8ae6b464fc96f, 0x3b0b8bc90012929d, 0xb77ada0617e3bbcb, 0x9ce6ebb40173744, 0xe55990879ddcaabd, 0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6, 0x9fa946824a12232d, 0xb32df8e9f3546564, 0x47939822dc96abf9, 0xdff9772470297ebd, 0x59787e2b93bc56f7, 0x8bfbea76c619ef36, 0x57eb4edb3c55b65a, + 0xaefae51477a06b03, 0xede622920b6b23f1, 0xdab99e59958885c4, 0xe95fab368e45eced, 0x88b402f7fd75539b, 0x11dbcb0218ebb414, 0xaae103b5fcd2a881, 0xd652bdc29f26a119, + 0xd59944a37c0752a2, 0x4be76d3346f0495f, 0x857fcae62d8493a5, 0x6f70a4400c562ddb, 0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952, 0xd097ad07a71f26b2, 0x7e2000a41346a7a7, + 0x825ecc24c873782f, 0x8ed400668c0c28c8, 0xa2f67f2dfa90563b, 0x728900802f0f32fa, 0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9, 0xfea126b7d78186bc, 0xe2f610c84987bfa8, + 0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9, 0xc6ede63fa05d3143, 0x91503d1c79720dbb, 0xf8a95fcf88747d94, 0x75a44c6397ce912a, 0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba, + 0xc24452da229b021b, 0xfbe85badce996168, 0xf2d56790ab41c2a2, 0xfae27299423fb9c3, 0x97c560ba6b0919a5, 0xdccd879fc967d41a, 0xbdb6b8e905cb600f, 0x5400e987bbc1c920, + 0xed246723473e3813, 0x290123e9aab23b68, 0x9436c0760c86e30b, 0xf9a0b6720aaf6521, 0xb94470938fa89bce, 0xf808e40e8d5b3e69, 0xe7958cb87392c2c2, 0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2, 0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3, 0xe2280b6c20dd5232, 0x25c6da63c38de1b0, 0x8d590723948a535f, 0x579c487e5a38ad0e, + 0xb0af48ec79ace837, 0x2d835a9df0c6d851, 0xdcdb1b2798182244, 0xf8e431456cf88e65, 0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff, 0xac8b2d36eed2dac5, 0xe272467e3d222f3f, + 0xd7adf884aa879177, 0x5b0ed81dcc6abb0f, 0x86ccbb52ea94baea, 0x98e947129fc2b4e9, 0xa87fea27a539e9a5, 0x3f2398d747b36224, 0xd29fe4b18e88640e, 0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89, 0x1953cf68300424ac, 0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7, 0xcdb02555653131b6, 0x3792f412cb06794d, 0x808e17555f3ebf11, 0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4, 0xc8de047564d20a8b, 
0xf245825a5a445275, 0xfb158592be068d2e, 0xeed6e2f0f0d56712, 0x9ced737bb6c4183d, 0x55464dd69685606b, + 0xc428d05aa4751e4c, 0xaa97e14c3c26b886, 0xf53304714d9265df, 0xd53dd99f4b3066a8, 0x993fe2c6d07b7fab, 0xe546a8038efe4029, 0xbf8fdb78849a5f96, 0xde98520472bdd033, + 0xef73d256a5c0f77c, 0x963e66858f6d4440, 0x95a8637627989aad, 0xdde7001379a44aa8, 0xbb127c53b17ec159, 0x5560c018580d5d52, 0xe9d71b689dde71af, 0xaab8f01e6e10b4a6, + 0x9226712162ab070d, 0xcab3961304ca70e8, 0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22, 0xe45c10c42a2b3b05, 0x8cb89a7db77c506a, 0x8eb98a7a9a5b04e3, 0x77f3608e92adb242, + 0xb267ed1940f1c61c, 0x55f038b237591ed3, 0xdf01e85f912e37a3, 0x6b6c46dec52f6688, 0x8b61313bbabce2c6, 0x2323ac4b3b3da015, 0xae397d8aa96c1b77, 0xabec975e0a0d081a, + 0xd9c7dced53c72255, 0x96e7bd358c904a21, 0x881cea14545c7575, 0x7e50d64177da2e54, 0xaa242499697392d2, 0xdde50bd1d5d0b9e9, 0xd4ad2dbfc3d07787, 0x955e4ec64b44e864, + 0x84ec3c97da624ab4, 0xbd5af13bef0b113e, 0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e, 0xcfb11ead453994ba, 0x67de18eda5814af2, 0x81ceb32c4b43fcf4, 0x80eacf948770ced7, + 0xa2425ff75e14fc31, 0xa1258379a94d028d, 0xcad2f7f5359a3b3e, 0x96ee45813a04330, 0xfd87b5f28300ca0d, 0x8bca9d6e188853fc, 0x9e74d1b791e07e48, 0x775ea264cf55347e, + 0xc612062576589dda, 0x95364afe032a819e, 0xf79687aed3eec551, 0x3a83ddbd83f52205, 0x9abe14cd44753b52, 0xc4926a9672793543, 0xc16d9a0095928a27, 0x75b7053c0f178294, + 0xf1c90080baf72cb1, 0x5324c68b12dd6339, 0x971da05074da7bee, 0xd3f6fc16ebca5e04, 0xbce5086492111aea, 0x88f4bb1ca6bcf585, 0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07, 0x3aff322e62439fd0, 0xb877aa3236a4b449, 0x9befeb9fad487c3, 0xe69594bec44de15b, 0x4c2ebe687989a9b4, 0x901d7cf73ab0acd9, 0xf9d37014bf60a11, + 0xb424dc35095cd80f, 0x538484c19ef38c95, 0xe12e13424bb40e13, 0x2865a5f206b06fba, 0x8cbccc096f5088cb, 0xf93f87b7442e45d4, 0xafebff0bcb24aafe, 0xf78f69a51539d749, + 0xdbe6fecebdedd5be, 0xb573440e5a884d1c, 0x89705f4136b4a597, 0x31680a88f8953031, 0xabcc77118461cefc, 
0xfdc20d2b36ba7c3e, 0xd6bf94d5e57a42bc, 0x3d32907604691b4d, + 0x8637bd05af6c69b5, 0xa63f9a49c2c1b110, 0xa7c5ac471b478423, 0xfcf80dc33721d54, 0xd1b71758e219652b, 0xd3c36113404ea4a9, 0x83126e978d4fdf3b, 0x645a1cac083126ea, + 0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4, 0xcccccccccccccccc, 0xcccccccccccccccd, 0x8000000000000000, 0x0, 0xa000000000000000, 0x0, + 0xc800000000000000, 0x0, 0xfa00000000000000, 0x0, 0x9c40000000000000, 0x0, 0xc350000000000000, 0x0, + 0xf424000000000000, 0x0, 0x9896800000000000, 0x0, 0xbebc200000000000, 0x0, 0xee6b280000000000, 0x0, + 0x9502f90000000000, 0x0, 0xba43b74000000000, 0x0, 0xe8d4a51000000000, 0x0, 0x9184e72a00000000, 0x0, + 0xb5e620f480000000, 0x0, 0xe35fa931a0000000, 0x0, 0x8e1bc9bf04000000, 0x0, 0xb1a2bc2ec5000000, 0x0, + 0xde0b6b3a76400000, 0x0, 0x8ac7230489e80000, 0x0, 0xad78ebc5ac620000, 0x0, 0xd8d726b7177a8000, 0x0, + 0x878678326eac9000, 0x0, 0xa968163f0a57b400, 0x0, 0xd3c21bcecceda100, 0x0, 0x84595161401484a0, 0x0, + 0xa56fa5b99019a5c8, 0x0, 0xcecb8f27f4200f3a, 0x0, 0x813f3978f8940984, 0x4000000000000000, 0xa18f07d736b90be5, 0x5000000000000000, + 0xc9f2c9cd04674ede, 0xa400000000000000, 0xfc6f7c4045812296, 0x4d00000000000000, 0x9dc5ada82b70b59d, 0xf020000000000000, 0xc5371912364ce305, 0x6c28000000000000, + 0xf684df56c3e01bc6, 0xc732000000000000, 0x9a130b963a6c115c, 0x3c7f400000000000, 0xc097ce7bc90715b3, 0x4b9f100000000000, 0xf0bdc21abb48db20, 0x1e86d40000000000, + 0x96769950b50d88f4, 0x1314448000000000, 0xbc143fa4e250eb31, 0x17d955a000000000, 0xeb194f8e1ae525fd, 0x5dcfab0800000000, 0x92efd1b8d0cf37be, 0x5aa1cae500000000, + 0xb7abc627050305ad, 0xf14a3d9e40000000, 0xe596b7b0c643c719, 0x6d9ccd05d0000000, 0x8f7e32ce7bea5c6f, 0xe4820023a2000000, 0xb35dbf821ae4f38b, 0xdda2802c8a800000, + 0xe0352f62a19e306e, 0xd50b2037ad200000, 0x8c213d9da502de45, 0x4526f422cc340000, 0xaf298d050e4395d6, 0x9670b12b7f410000, 0xdaf3f04651d47b4c, 0x3c0cdd765f114000, + 0x88d8762bf324cd0f, 0xa5880a69fb6ac800, 0xab0e93b6efee0053, 0x8eea0d047a457a00, 
0xd5d238a4abe98068, 0x72a4904598d6d880, 0x85a36366eb71f041, 0x47a6da2b7f864750, + 0xa70c3c40a64e6c51, 0x999090b65f67d924, 0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d, 0x82818f1281ed449f, 0xbff8f10e7a8921a4, 0xa321f2d7226895c7, 0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490, 0xfee50b7025c36a08, 0x2f236d04753d5b4, 0x9f4f2726179a2245, 0x1d762422c946590, 0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2, 0x9b934c3b330c8577, 0x63cc55f49f88eb2f, 0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb, 0xf316271c7fc3908a, 0x8bef464e3945ef7a, + 0x97edd871cfda3a56, 0x97758bf0e3cbb5ac, 0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317, 0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd, 0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436, 0xb3e2fd538e122b44, 0xe7d34c64a9c85d44, 0x60dbbca87196b616, 0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd, 0xb51d13aea4a488dd, 0x6babab6398bdbe41, + 0xe264589a4dcdab14, 0xc696963c7eed2dd1, 0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2, 0xb0de65388cc8ada8, 0x3b25a55f43294bcb, 0xdd15fe86affad912, 0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab, 0x6e3569326c784337, 0xacb92ed9397bf996, 0x49c2c37f07965404, 0xd7e77a8f87daf7fb, 0xdc33745ec97be906, 0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3, + 0xa8acd7c0222311bc, 0xc40832ea0d68ce0c, 0xd2d80db02aabd62b, 0xf50a3fa490c30190, 0x83c7088e1aab65db, 0x792667c6da79e0fa, 0xa4b8cab1a1563f52, 0x577001b891185938, + 0xcde6fd5e09abcf26, 0xed4c0226b55e6f86, 0x80b05e5ac60b6178, 0x544f8158315b05b4, 0xa0dc75f1778e39d6, 0x696361ae3db1c721, 0xc913936dd571c84c, 0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f, 0x4ab48a04065c723, 0x9d174b2dcec0e47b, 0x62eb0d64283f9c76, 0xc45d1df942711d9a, 0x3ba5d0bd324f8394, 0xf5746577930d6500, 0xca8f44ec7ee36479, + 0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb, 0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e, 0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e, 0x95d04aee3b80ece5, 0xbba1f1d158724a12, + 0xbb445da9ca61281f, 0x2a8a6e45ae8edc97, 0xea1575143cf97226, 0xf52d09d71a3293bd, 0x924d692ca61be758, 0x593c2626705f9c56, 0xb6e0c377cfa2e12e, 
0x6f8b2fb00c77836c, + 0xe498f455c38b997a, 0xb6dfb9c0f956447, 0x8edf98b59a373fec, 0x4724bd4189bd5eac, 0xb2977ee300c50fe7, 0x58edec91ec2cb657, 0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed, + 0x8b865b215899f46c, 0xbd79e0d20082ee74, 0xae67f1e9aec07187, 0xecd8590680a3aa11, 0xda01ee641a708de9, 0xe80e6f4820cc9495, 0x884134fe908658b2, 0x3109058d147fdcdd, + 0xaa51823e34a7eede, 0xbd4b46f0599fd415, 0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a, 0x850fadc09923329e, 0x3e2cf6bc604ddb0, 0xa6539930bf6bff45, 0x84db8346b786151c, + 0xcfe87f7cef46ff16, 0xe612641865679a63, 0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e, 0xa26da3999aef7749, 0xe3be5e330f38f09d, 0xcb090c8001ab551c, 0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6, 0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa, 0xc646d63501a1511d, 0xb281e1fd541501b8, 0xf7d88bc24209a565, 0x1f225a7ca91a4226, + 0x9ae757596946075f, 0x3375788de9b06958, 0xc1a12d2fc3978937, 0x52d6b1641c83ae, 0xf209787bb47d6b84, 0xc0678c5dbd23a49a, 0x9745eb4d50ce6332, 0xf840b7ba963646e0, + 0xbd176620a501fbff, 0xb650e5a93bc3d898, 0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe, 0x93ba47c980e98cdf, 0xc66f336c36b10137, 0xb8a8d9bbe123f017, 0xb80b0047445d4184, + 0xe6d3102ad96cec1d, 0xa60dc059157491e5, 0x9043ea1ac7e41392, 0x87c89837ad68db2f, 0xb454e4a179dd1877, 0x29babe4598c311fb, 0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d, 0x1899e4a65f58660c, 0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f, 0xdc21a1171d42645d, 0x76707543f4fa1f73, 0x899504ae72497eba, 0x6a06494a791c53a8, + 0xabfa45da0edbde69, 0x487db9d17636892, 0xd6f8d7509292d603, 0x45a9d2845d3c42b6, 0x865b86925b9bc5c2, 0xb8a2392ba45a9b2, 0xa7f26836f282b732, 0x8e6cac7768d7141e, + 0xd1ef0244af2364ff, 0x3207d795430cd926, 0x8335616aed761f1f, 0x7f44e6bd49e807b8, 0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6, 0xcd036837130890a1, 0x36dba887c37a8c0f, + 0x802221226be55a64, 0xc2494954da2c9789, 0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c, 0xc83553c5c8965d3d, 0x6f92829494e5acc7, 0xfa42a8b73abbf48c, 0xcb772339ba1f17f9, + 0x9c69a97284b578d7, 0xff2a760414536efb, 
0xc38413cf25e2d70d, 0xfef5138519684aba, 0xf46518c2ef5b8cd1, 0x7eb258665fc25d69, 0x98bf2f79d5993802, 0xef2f773ffbd97a61, + 0xbeeefb584aff8603, 0xaafb550ffacfd8fa, 0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38, 0x952ab45cfa97a0b2, 0xdd945a747bf26183, 0xba756174393d88df, 0x94f971119aeef9e4, + 0xe912b9d1478ceb17, 0x7a37cd5601aab85d, 0x91abb422ccb812ee, 0xac62e055c10ab33a, 0xb616a12b7fe617aa, 0x577b986b314d6009, 0xe39c49765fdf9d94, 0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d, 0x14588f13be847307, 0xb1d219647ae6b31c, 0x596eb2d8ae258fc8, 0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb, 0x8aec23d680043bee, 0x25de7bb9480d5854, + 0xada72ccc20054ae9, 0xaf561aa79a10ae6a, 0xd910f7ff28069da4, 0x1b2ba1518094da04, 0x87aa9aff79042286, 0x90fb44d2f05d0842, 0xa99541bf57452b28, 0x353a1607ac744a53, + 0xd3fa922f2d1675f2, 0x42889b8997915ce8, 0x847c9b5d7c2e09b7, 0x69956135febada11, 0xa59bc234db398c25, 0x43fab9837e699095, 0xcf02b2c21207ef2e, 0x94f967e45e03f4bb, + 0x8161afb94b44f57d, 0x1d1be0eebac278f5, 0xa1ba1ba79e1632dc, 0x6462d92a69731732, 0xca28a291859bbf93, 0x7d7b8f7503cfdcfe, 0xfcb2cb35e702af78, 0x5cda735244c3d43e, + 0x9defbf01b061adab, 0x3a0888136afa64a7, 0xc56baec21c7a1916, 0x88aaa1845b8fdd0, 0xf6c69a72a3989f5b, 0x8aad549e57273d45, 0x9a3c2087a63f6399, 0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd, 0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5, 0x969eb7c47859e743, 0x9f644ae5a4b1b325, 0xbc4665b596706114, 0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959, 0xa90cb506d155a7ea, 0x9316ff75dd87cbd8, 0x9a7f12442d588f2, 0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f, 0xe5d3ef282a242e81, 0x8f1668c8a86da5fa, + 0x8fa475791a569d10, 0xf96e017d694487bc, 0xb38d92d760ec4455, 0x37c981dcc395a9ac, 0xe070f78d3927556a, 0x85bbe253f47b1417, 0x8c469ab843b89562, 0x93956d7478ccec8e, + 0xaf58416654a6babb, 0x387ac8d1970027b2, 0xdb2e51bfe9d0696a, 0x6997b05fcc0319e, 0x88fcf317f22241e2, 0x441fece3bdf81f03, 0xab3c2fddeeaad25a, 0xd527e81cad7626c3, + 0xd60b3bd56a5586f1, 0x8a71e223d8d3b074, 0x85c7056562757456, 0xf6872d5667844e49, 0xa738c6bebb12d16c, 
0xb428f8ac016561db, 0xd106f86e69d785c7, 0xe13336d701beba52, + 0x82a45b450226b39c, 0xecc0024661173473, 0xa34d721642b06084, 0x27f002d7f95d0190, 0xcc20ce9bd35c78a5, 0x31ec038df7b441f4, 0xff290242c83396ce, 0x7e67047175a15271, + 0x9f79a169bd203e41, 0xf0062c6e984d386, 0xc75809c42c684dd1, 0x52c07b78a3e60868, 0xf92e0c3537826145, 0xa7709a56ccdf8a82, 0x9bbcc7a142b17ccb, 0x88a66076400bb691, + 0xc2abf989935ddbfe, 0x6acff893d00ea435, 0xf356f7ebf83552fe, 0x583f6b8c4124d43, 0x98165af37b2153de, 0xc3727a337a8b704a, 0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c, + 0xeda2ee1c7064130c, 0x1162def06f79df73, 0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8, 0xb9a74a0637ce2ee1, 0x6d953e2bd7173692, 0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0, 0x1d9c9892400a22a2, 0xb54d5e4a127f59c8, 0x2503beb6d00cab4b, 0xe2a0b5dc971f303a, 0x2e44ae64840fd61d, 0x8da471a9de737e24, 0x5ceaecfed289e5d2, + 0xb10d8e1456105dad, 0x7425a83e872c5f47, 0xdd50f1996b947518, 0xd12f124e28f77719, 0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f, 0xace73cbfdc0bfb7b, 0x636cc64d1001550b, + 0xd8210befd30efa5a, 0x3c47f7e05401aa4e, 0x8714a775e3e95c78, 0x65acfaec34810a71, 0xa8d9d1535ce3b396, 0x7f1839a741a14d0d, 0xd31045a8341ca07c, 0x1ede48111209a050, + 0x83ea2b892091e44d, 0x934aed0aab460432, 0xa4e4b66b68b65d60, 0xf81da84d5617853f, 0xce1de40642e3f4b9, 0x36251260ab9d668e, 0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019, + 0xa1075a24e4421730, 0xb24cf65b8612f81f, 0xc94930ae1d529cfc, 0xdee033f26797b627, 0xfb9b7cd9a4a7443c, 0x169840ef017da3b1, 0x9d412e0806e88aa5, 0x8e1f289560ee864e, + 0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2, 0xf5b5d7ec8acb58a2, 0xae10af696774b1db, 0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29, 0xbff610b0cc6edd3f, 0x17fd090a58d32af3, + 0xeff394dcff8a948e, 0xddfc4b4cef07f5b0, 0x95f83d0a1fb69cd9, 0x4abdaf101564f98e, 0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1, 0xea53df5fd18d5513, 0x84c86189216dc5ed, + 0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4, 0xb7118682dbb66a77, 0x3fbc8c33221dc2a1, 0xe4d5e82392a40515, 0xfabaf3feaa5334a, 0x8f05b1163ba6832d, 
0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8, 0x743e20e9ef511012, 0xdf78e4b2bd342cf6, 0x914da9246b255416, 0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e, 0xae9672aba3d0c320, 0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e, 0x8865899617fb1871, 0x7e2fa67c7a658892, 0xaa7eebfb9df9de8d, 0xddbb901b98feeab7, 0xd51ea6fa85785631, 0x552a74227f3ea565, + 0x8533285c936b35de, 0xd53a88958f87275f, 0xa67ff273b8460356, 0x8a892abaf368f137, 0xd01fef10a657842c, 0x2d2b7569b0432d85, 0x8213f56a67f6b29b, 0x9c3b29620e29fc73, + 0xa298f2c501f45f42, 0x8349f3ba91b47b8f, 0xcb3f2f7642717713, 0x241c70a936219a73, 0xfe0efb53d30dd4d7, 0xed238cd383aa0110, 0x9ec95d1463e8a506, 0xf4363804324a40aa, + 0xc67bb4597ce2ce48, 0xb143c6053edcd0d5, 0xf81aa16fdc1b81da, 0xdd94b7868e94050a, 0x9b10a4e5e9913128, 0xca7cf2b4191c8326, 0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf, 0xbc633b39673c8cec, 0x976e41088617ca01, 0xd5be0503e085d813, 0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18, 0xec9c459d51852ba2, 0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45, 0xcabb90e5c942b503, 0xb8da1662e7b00a17, 0x3d6a751f3b936243, 0xe7109bfba19c0c9d, 0xcc512670a783ad4, 0x906a617d450187e2, 0x27fb2b80668b24c5, + 0xb484f9dc9641e9da, 0xb1f9f660802dedf6, 0xe1a63853bbd26451, 0x5e7873f8a0396973, 0x8d07e33455637eb2, 0xdb0b487b6423e1e8, 0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7, 0x7641a140cc7810fb, 0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d, 0xac2820d9623bf429, 0x546345fa9fbdcd44, 0xd732290fbacaf133, 0xa97c177947ad4095, + 0x867f59a9d4bed6c0, 0x49ed8eabcccc485d, 0xa81f301449ee8c70, 0x5c68f256bfff5a74, 0xd226fc195c6a2f8c, 0x73832eec6fff3111, 0x83585d8fd9c25db7, 0xc831fd53c5ff7eab, + 0xa42e74f3d032f525, 0xba3e7ca8b77f5e55, 0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb, 0x80444b5e7aa7cf85, 0x7980d163cf5b81b3, 0xa0555e361951c366, 0xd7e105bcc332621f, + 0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7, 0xfa856334878fc150, 0xb14f98f6f0feb951, 0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3, 0xc3b8358109e84f07, 0xa862f80ec4700c8, + 0xf4a642e14c6262c8, 
0xcd27bb612758c0fa, 0x98e7e9cccfbd7dbd, 0x8038d51cb897789c, 0xbf21e44003acdd2c, 0xe0470a63e6bd56c3, 0xeeea5d5004981478, 0x1858ccfce06cac74, + 0x95527a5202df0ccb, 0xf37801e0c43ebc8, 0xbaa718e68396cffd, 0xd30560258f54e6ba, 0xe950df20247c83fd, 0x47c6b82ef32a2069, 0x91d28b7416cdd27e, 0x4cdc331d57fa5441, + 0xb6472e511c81471d, 0xe0133fe4adf8e952, 0xe3d8f9e563a198e5, 0x58180fddd97723a6, 0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648, +}; + + /* Maximum mantissa for fast path: 2^53 */ #define MAX_MANTISSA_FAST_PATH 9007199254740992ULL /* 2^53 */ @@ -159,6 +348,190 @@ static inline uint32_t parse_eight_digits_swar(uint64_t val) { return (uint32_t)val; } +/* ---------------------------------------------------------------------------- + * Eisel-Lemire algorithm — core (compute_float / am_to_double). + * + * Given a decimal mantissa `w` (≤ 19 digits, fits in uint64) and exponent `q`, + * compute the correctly-rounded `double` representing `w * 10^q`. Internally: + * + * 1. Shift `w` so its leading bit is set (full 64-bit mantissa). + * 2. Multiply by the 128-bit precomputed power-of-five entry above. + * 3. Extract the 53-bit mantissa from the high 64 bits of the product, with + * one extra bit for round-to-nearest-even. + * 4. Apply the round-half-to-even rule, including the rare power-of-2 tie + * case that needs a second-pass check. + * + * For the 19-digit / |q| ≤ 22 input range the result is provably bit-exact + * with strtod() (Mushtak & Lemire, "Fast Number Parsing Without Fallback"). + * The caller falls back to strtod() if compute_float() signals indeterminate + * (we never trigger that branch with parse_number_string's bounded inputs). + * + * Ported from fast_float by Daniel Lemire & Joao Paulo Magalhaes + * (MIT-licensed, https://github.com/fastfloat/fast_float — decimal_to_binary.h + * and float_common.h). C++ template machinery dropped in favour of a + * double-only specialisation; struct layouts kept to ease future review. 
+ * ---------------------------------------------------------------------------- */ + +/* IEEE-754 binary64 constants (mirrors fast_float's binary_format). */ +#define DOUBLE_MANTISSA_EXPLICIT_BITS 52 +#define DOUBLE_MIN_EXPONENT_ROUND_EVEN -4 +#define DOUBLE_MAX_EXPONENT_ROUND_EVEN 23 +#define DOUBLE_MINIMUM_EXPONENT -1023 +#define DOUBLE_INFINITE_POWER 0x7FF + +/* 128-bit unsigned, little-endian: low holds bits [0..63]. */ +typedef struct { + uint64_t low; + uint64_t high; +} value128; + +/* Result of compute_float_d(): a mantissa and a biased binary exponent. + * power2 < 0 means indeterminate (caller falls back to strtod()); compute_float_d() never returns it today. */ +typedef struct { + uint64_t mantissa; + int32_t power2; +} adjusted_mantissa; + +/* `__builtin_clzll` is undefined on input 0 — caller guarantees v > 0. */ +static inline int leading_zeroes_u64(uint64_t v) { + return __builtin_clzll(v); +} + +/* 64x64 -> 128 multiplication. Uses __uint128_t when the compiler provides it + * (__SIZEOF_INT128__), otherwise a portable split into 32-bit halves. */ +static inline value128 full_multiplication(uint64_t a, uint64_t b) { + value128 r; +#ifdef __SIZEOF_INT128__ + __uint128_t prod = (__uint128_t)a * (__uint128_t)b; + r.low = (uint64_t)prod; + r.high = (uint64_t)(prod >> 64); +#else + /* 32-bit fallback: split each operand into two 32-bit halves. */ + uint64_t a_lo = (uint32_t)a, a_hi = a >> 32; + uint64_t b_lo = (uint32_t)b, b_hi = b >> 32; + uint64_t ll = a_lo * b_lo; + uint64_t lh = a_lo * b_hi; + uint64_t hl = a_hi * b_lo; + uint64_t hh = a_hi * b_hi; + uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl; + r.low = (mid << 32) | (uint32_t)ll; + r.high = hh + (lh >> 32) + (hl >> 32) + (mid >> 32); +#endif + return r; +} + +/* For q in (-400, 350), this approximates floor(log2(5^q)) + q + 63 + * (or -ceil(log2(5^|q|)) + q + 63 for negative q). Used to derive power2.
*/ +static inline int32_t eisel_lemire_power(int32_t q) { + return (((152170 + 65536) * q) >> 16) + 63; +} + +/* 128-bit approximation of `w * 5^q`. The optional fixup multiplies by the + * second (extension) entry of the power-of-five table when the high half is + * close to a rounding boundary. Mathematical proof of sufficiency: see + * Mushtak & Lemire, "Fast Number Parsing Without Fallback". */ +static inline value128 compute_product_approximation_d(int64_t q, uint64_t w) { + int index = 2 * (int)(q - EISEL_LEMIRE_SMALLEST_POWER_OF_FIVE); + value128 firstproduct = full_multiplication(w, power_of_five_128[index]); + /* For double, bit_precision = mantissa_explicit_bits (52) + 3 = 55. */ + const uint64_t precision_mask = + (uint64_t)0xFFFFFFFFFFFFFFFFULL >> 55; + if ((firstproduct.high & precision_mask) == precision_mask) { + value128 secondproduct = + full_multiplication(w, power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if (secondproduct.high > firstproduct.low) { + firstproduct.high++; + } + } + return firstproduct; +} + +/* Eisel-Lemire main: compute a correctly-rounded representation of w * 10^q. + * Returns an `adjusted_mantissa`. Special outputs: + * - mantissa == 0 && power2 == 0: result is +/-0 + * - power2 == DOUBLE_INFINITE_POWER && mantissa == 0: result is infinity + * - power2 < 0: indeterminate (caller should fall back to strtod()). With + * parse_number_string()'s bounded mantissa (<= 19 digits), this branch + * is unreachable, but we keep the signature for safety. + */ +static adjusted_mantissa compute_float_d(int64_t q, uint64_t w) { + adjusted_mantissa answer; + + if (w == 0 || q < EISEL_LEMIRE_SMALLEST_POWER_OF_FIVE) { + answer.power2 = 0; + answer.mantissa = 0; + return answer; + } + if (q > EISEL_LEMIRE_LARGEST_POWER_OF_FIVE) { + answer.power2 = DOUBLE_INFINITE_POWER; + answer.mantissa = 0; + return answer; + } + + /* Renormalise w so its top bit is set. 
*/ + int lz = leading_zeroes_u64(w); + w <<= lz; + + value128 product = compute_product_approximation_d(q, w); + + int upperbit = (int)(product.high >> 63); + int shift = upperbit + 64 - DOUBLE_MANTISSA_EXPLICIT_BITS - 3; + + answer.mantissa = product.high >> shift; + answer.power2 = (int32_t)(eisel_lemire_power((int32_t)q) + upperbit - lz - DOUBLE_MINIMUM_EXPONENT); + + if (answer.power2 <= 0) { + /* Subnormal path. */ + if (-answer.power2 + 1 >= 64) { + /* More than 64 bits below minimum exponent — definitely zero. */ + answer.power2 = 0; + answer.mantissa = 0; + return answer; + } + /* Safe: -answer.power2 + 1 < 64. */ + answer.mantissa >>= -answer.power2 + 1; + answer.mantissa += (answer.mantissa & 1); /* round up */ + answer.mantissa >>= 1; + /* If post-rounding the value crosses back into the normal range, mark + * it normal (power2 = 1) rather than subnormal (power2 = 0). */ + answer.power2 = (answer.mantissa < ((uint64_t)1 << DOUBLE_MANTISSA_EXPLICIT_BITS)) ? 0 : 1; + return answer; + } + + /* Normal path: handle the round-half-to-even tie case. */ + if ((product.low <= 1) && + (q >= DOUBLE_MIN_EXPONENT_ROUND_EVEN) && + (q <= DOUBLE_MAX_EXPONENT_ROUND_EVEN) && + ((answer.mantissa & 3) == 1)) { + if ((answer.mantissa << shift) == product.high) { + answer.mantissa &= ~(uint64_t)1; /* clear LSB so we round down */ + } + } + answer.mantissa += (answer.mantissa & 1); + answer.mantissa >>= 1; + if (answer.mantissa >= ((uint64_t)2 << DOUBLE_MANTISSA_EXPLICIT_BITS)) { + answer.mantissa = (uint64_t)1 << DOUBLE_MANTISSA_EXPLICIT_BITS; + answer.power2++; + } + answer.mantissa &= ~((uint64_t)1 << DOUBLE_MANTISSA_EXPLICIT_BITS); + if (answer.power2 >= DOUBLE_INFINITE_POWER) { + answer.power2 = DOUBLE_INFINITE_POWER; + answer.mantissa = 0; + } + return answer; +} + +/* Pack adjusted_mantissa back to a double via IEEE-754 bit layout. 
*/ +static inline double am_to_double(int negative, adjusted_mantissa am) { + uint64_t word = am.mantissa; + word |= (uint64_t)am.power2 << DOUBLE_MANTISSA_EXPLICIT_BITS; + if (negative) word |= (uint64_t)1 << 63; + double value; + memcpy(&value, &word, sizeof(value)); + return value; +} + /* Parse a decimal number string into components. * This follows the fast_float algorithm closely. */ static inline int parse_number_string(const char *p, const char *pend, double *result, const char **endptr) { @@ -261,65 +634,42 @@ static inline int parse_number_string(const char *p, const char *pend, double *r if (digit_count > MAX_DIGITS) return 0; } - /* Check if we're within fast path bounds */ - if (exponent < MIN_EXPONENT_FAST_PATH) return 0; - if (exponent > MAX_EXPONENT_FAST_PATH) return 0; - + /* Pick the conversion path. Two regimes: + * Clinger fast path: small mantissa (<= 2^53) and small |exp| (<= 22). + * One double multiply or divide; cheapest, exact by construction. + * Eisel-Lemire: large mantissa or wide exponent range (full double + * domain). Slightly slower per call (128-bit multiply + table lookup) + * but correctly-rounded by the Mushtak-Lemire proof. + * Inputs outside both ranges fall back to strtod() (caller of this fn). */ double value; - if (mantissa <= MAX_MANTISSA_FAST_PATH) { + if (mantissa <= MAX_MANTISSA_FAST_PATH && + exponent >= MIN_EXPONENT_FAST_PATH && + exponent <= MAX_EXPONENT_FAST_PATH) + { /* Clinger fast path: all operands exact in double precision, * single multiply/divide produces a correctly-rounded result. */ value = (double)mantissa; if (exponent < 0) value = value / powers_of_ten[-exponent]; else if (exponent > 0) value = value * powers_of_ten[exponent]; + if (negative) value = -value; } else { -#ifdef __SIZEOF_INT128__ - /* Widened fast path for 17-19 significant-digit mantissas. - * - * (double)mantissa alone loses up to 11 bits when mantissa > 2^53, - * so the existing Clinger path would yield up to 1 ULP vs strtod. 
- * We recover full precision by doing the multiply/divide in 128-bit - * integer arithmetic (correctly-rounded by construction). Cases - * outside the supported exponent range fall through to strtod. - * - * Requires __uint128_t (GCC/Clang builtin, available on every 64-bit - * target Redis supports). 32-bit builds take the strtod() fallback. */ - if (exponent < -19 || exponent > 19) return 0; + /* Eisel-Lemire path. Replaces a previously hand-rolled widened branch + * (`(double)hi * 2^64 + (double)lo` shortcut) that produced ±1 ULP + * mismatches vs strtod() on inputs like 9007199255094284e-19 and + * 2489830482329185244e1. compute_float_d is bit-exact with strtod() + * for every input parse_number_string can produce. */ + if (exponent < EISEL_LEMIRE_SMALLEST_POWER_OF_FIVE || exponent > EISEL_LEMIRE_LARGEST_POWER_OF_FIVE) + return 0; - if (exponent >= 0) { - /* (mantissa * 10^e) fits in 128 bits. Convert exactly: the - * single (double) cast from __uint128_t rounds to nearest. */ - __uint128_t prod = (__uint128_t)mantissa * (uint64_t)powers_of_ten[exponent]; - uint64_t hi = (uint64_t)(prod >> 64); - uint64_t lo = (uint64_t)prod; - /* (double)hi * 2^64 has no rounding error (hi up to 2^64-1 rounds - * once, then * 2^64 is exact). Adding lo rounds once. Total: - * matches strtod on every tested case with e in [0,19]. */ - value = (double)hi * 18446744073709551616.0 + (double)lo; - } else { - /* mantissa / 10^|e|: scale numerator up by 2^64 before integer - * division to preserve precision, then descale by multiplying by - * 2^-64 (exact power-of-two scaling, does not round). The single - * (double) cast of the integer quotient produces IEEE round-to- - * nearest-even, matching strtod() bit-exactly for every tested - * 16-19 significant digit case. 
*/ - uint64_t divisor = (uint64_t)powers_of_ten[-exponent]; - __uint128_t scaled = (__uint128_t)mantissa << 64; - __uint128_t q = scaled / divisor; - uint64_t hi = (uint64_t)(q >> 64); - uint64_t lo = (uint64_t)q; - value = ((double)hi * 18446744073709551616.0 + (double)lo) - * 5.421010862427522170037e-20; /* 2^-64 */ - } -#else - /* 32-bit target without __uint128_t: fall through to the strtod() - * fallback. Correctness is preserved (it's the same path that shipped - * in 8.8-M02); only the perf gain is 64-bit-target-specific. */ - return 0; -#endif + adjusted_mantissa am = compute_float_d(exponent, mantissa); + /* power2 < 0 would mean indeterminate (caller should fall back to + * strtod). With our bounded mantissa (<= 19 digits) this branch is + * unreachable per the Mushtak-Lemire proof, but we keep the guard so + * any future caller that supplies a larger mantissa stays correct. */ + if (am.power2 < 0) return 0; + value = am_to_double(negative, am); } - if (negative) value = -value; *result = value; return 1; } @@ -524,9 +874,87 @@ int fastFloatTest(int argc, char **argv, int flags) { /* Negative numbers exercising the widened path */ {"-0.49606648747577575", -0.49606648747577575}, {"-9007199254740993", -9007199254740992.0}, + + /* Eisel-Lemire rounding-boundary cases. + * Reported by @vitahlin on #14661 against the previous + * `(double)hi * 2^64 + (double)lo` widened branch which + * double-rounded the 128-bit product. Both must now match + * strtod() exactly. */ + {"9007199255094284e-19", 9007199255094284e-19}, /* was -1 ULP */ + {"2489830482329185244e1", 2489830482329185244e1}, /* was +1 ULP */ + + /* Subnormal boundaries (Eisel-Lemire's subnormal branch). 
*/ + {"5e-324", 5e-324}, /* smallest pos subnormal */ + {"4.9e-324", 5e-324}, /* just under min subnormal: rounds up */ + {"2.2250738585072009e-308", 2.2250738585072009e-308}, /* largest subnormal */ + {"2.2250738585072014e-308", 2.2250738585072014e-308}, /* smallest normal */ + {"1e-323", 1e-323}, + + /* Round-half-to-even ties (compute_float_d tie path: product.low <= 1, q in + * [-4, 23], mantissa & 3 == 1). NOTE(review): entries with mantissa <= 2^53 and small |exp|, e.g. the first, take the Clinger path instead. */ + {"5497558138880", 5497558138880.0}, /* 5 * 2^40 = 2^42 + 2^40 */ + {"5e-22", 5e-22}, + {"7.038531e-26", 7.038531e-26}, + {"4503599627475501e-10", 4503599627475501e-10}, /* near 2^52 */ + + /* Largest finite double + overflow. */ + {"1.7976931348623157e308", 1.7976931348623157e308}, /* DBL_MAX */ + {"1.7976931348623158e308", 1.7976931348623157e308}, /* nearest is DBL_MAX */ + {"1e308", 1e308}, + + /* Wide exponent range now reachable via Eisel-Lemire (previously + * fell to strtod). */ + {"1.234567890123456e100", 1.234567890123456e100}, + {"9.999999999999999e99", 9.999999999999999e99}, + {"1e-300", 1e-300}, + {"1.7e-300", 1.7e-300}, + + /* Repunit / many-9 mantissas — adjacent-double tie territory. */ + {"9999999999999998", 9999999999999998.0}, + {"99999999999999999", 1e17}, }; run_ff_tests(decimal_ok, COUNTOF(decimal_ok), 0); + /* Differential cross-check: every accepted input must produce the + * exact same bits as libc strtod(). Hand-picked hard cases covering + * every code path in compute_float_d (subnormal branch, round-half- + * to-even tie path, near-infinity, repunit mantissa, wide exponent). */ + { + static const char *diff_inputs[] = { + /* Boundary classics around 2^53. */ + "9007199254740992", "9007199254740993", "9007199254740994", + "9007199254740995", "9007199254740996", + /* Limits of finite double. 
*/ + "1.7976931348623157e308", "2.2250738585072014e-308", + "5e-324", "1e-323", "4.9406564584124654e-324", + /* The two reproducer inputs the previous widened branch missed.
*/ + "9007199255094284e-19", "2489830482329185244e1", + /* Mushtak-Lemire stress range — 19-digit mantissas. */ + "1234567890123456789e0", "1234567890123456789e-5", + "1234567890123456789e5", "9999999999999999e19", + /* Common scientific constants — mid-exponent sanity. */ + "3.141592653589793", "2.718281828459045", + "1.4142135623730951e150", "6.022140857e23", + "1.602176634e-19", "9.10938356e-31", + }; + for (int i = 0; i < COUNTOF(diff_inputs); i++) { + const char *s = diff_inputs[i]; + char *fend, *lend; + errno = 0; + double got = fast_float_strtod(s, strlen(s), &fend); + errno = 0; + double libc = strtod(s, &lend); + uint64_t gb, lb; + memcpy(&gb, &got, sizeof(gb)); + memcpy(&lb, &libc, sizeof(lb)); + char descr[160]; + snprintf(descr, sizeof(descr), + "differential vs strtod: \"%s\" ff=0x%016llx libc=0x%016llx", + s, (unsigned long long)gb, (unsigned long long)lb); + test_cond(descr, gb == lb); + } + } + /* No valid prefix for full buffer, or trailing junk. */ ff_testcase decimal_bad[] = { {"1abc", 1.0}, From 0d9576435f83af79122c7db4e97f32afbba5bc3e Mon Sep 17 00:00:00 2001 From: Salvatore Sanfilippo Date: Wed, 13 May 2026 18:56:44 +0200 Subject: [PATCH 02/19] Implement the new Redis Array type (#15162) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Redis Array For years, Redis has been missing a real indexed data structure for the use cases where the index and the spatial relationship of elements are semantic. Hashes give you random lookups, but you have to store an index as a key, and have no range visibility. Lists give you appending and trimming, but what is in the middle remains hard to access. Streams give you append-only events, which is another (useful, indeed) beast. None of these is what you want when the *position itself* has business meaning — slot 37, step 4, row 18552, day from 2934 to 2949, file line 11, 12, 15 and so forth. 
And, all those types, for different reasons, are all suboptimal when you want a **ring buffer** able to store the latest N observed samples of something. Up to now, users found ways (they always do \o/) using the fact that the data structures that are obvious in this universe are also extremely powerful, if well implemented. But this forces compromises. Arrays handle these index-first requirements natively, and usually with much better memory and CPU usage than the workarounds. If the use case is the right one, Arrays often provide much better space, time and usability at the same time. ## Internal encoding 1. When dense, an Array is essentially a more fancy C array. You don't pay anything for storing the index. 2. Yet, instead of going really flat, arrays are sliced into 4096-element slices, and each slice, when it contains just a few elements, uses a special sparse encoding. When a slice is empty it's just a `NULL` stored in the directory. 3. Small ints, floats, and short strings are pointer-tagged, so they cost zero additional memory beyond the pointer slot itself. 4. When very sparse, a super-directory of windowed directories is used. This allows the data type to be safe, instead of exhibiting pathological space or time behavior. This representation is only triggered when there are more than 8 million elements or very high indexes set. ## Use cases Arrays are mostly stateless if not for the fact that each array remembers the index of the latest added item, allowing `ARINSERT` and `ARRING` to work properly. Otherwise it is a set/get at this index game, with solid support for both setting / getting ranges, server-side scanning, returning only populated elements in a time which is proportional not to the range size, but to the population size. A few concrete examples, that may work as mental models for the set of problems that are similar to them (from the POV of the data modeling). 
**Thermometer.** A sensor reporting once per minute, with gaps: ``` ARSET temp:room12:day7 123 22.3 ARGETRANGE temp:room12:day7 600 660 # the 10:00–11:00 window, with NULLs ARSCAN temp:room12:day7 600 660 # only populated elements AROP temp:room12:day7 0 1439 MAX # peak of the day, server-side ``` Missing minutes cost little to nothing. Numeric aggregation runs inside Redis. Telemetry, IoT, meter readings, KPI rollups. **Calendar.** A clinic with 96 fifteen-minute slots per day: ``` ARSET sched:room12:day 32 booking:991 ARSCAN sched:room12:day 0 95 # only occupied slots ARGETRANGE sched:room12:day 48 63 # the afternoon full view to render ``` The slot number is the business key in this case. Room booking, parking spaces, warehouse bins, lockers, ... **Ring buffer.** ARRING replaces the classic LPUSH+LTRIM pattern. Imagine remote `dmesg`. ``` ARRING machine:123 200 "[141087.430123]: arm_cpu_init(): cpu 14 online" # Capped to 200 entries ARLASTITEMS machine:123 50 REV # 50 newest first ``` Faster than LPUSH+LTRIM, keep indexed access to past elements. Last-N alarms, recent fraud scores, access history, remote logs, device events. Ok here the use cases are mainly the ones of the old pattern: it is just a better fit and allows to access random items in the middle, aggregate server-side, and so forth. **Workflow.** Step number is the index, value is the status. Gaps are meaningful: ``` ARSET claim:99172 0 received ARSET claim:99172 3 waiting:reviewer42 ARSET claim:99172 5 approved ARGETRANGE claim:99172 0 5 # full workflow view, with NULLs for missing steps ARSCAN claim:99172 0 5 # only steps that have a state ARCOUNT claim:99172 # number of recorded steps ARLEN claim:99172 # highest reached step + 1 ``` **Skills knowledge base for agents.** Arrays are good at representing / grepping into Markdown files: ``` ARSET skill:metal_gpu 0 "...." ARSET skill:metal_gpu 1 "...." ARSET skill:metal_gpu 2 "...." 
ARGREP skill:metal_gpu - + RE "M3|M4" WITHVALUES ``` ARGREP has EXACT, MATCH, GLOB, RE, you can have multiple predicates, can select AND or OR behavior. **Bulk import results.** Sparse row annotations over millions of rows / CSV / ...: ``` ARSET import:job551 18552 ERR:bad_email ARSCAN import:job551 0 1000000 # Provides only rows that have something ``` ## TLDR If the position is part of the meaning, use an Array. If you want to aggregate or grep remotely, use an Array. Feedback welcome :) --------- Co-authored-by: debing.sun Co-authored-by: Shubham S Taple <155555100+ShubhamTaple@users.noreply.github.com> Co-authored-by: Yuan Wang Co-authored-by: Marc Gravell --- .gitignore | 1 + deps/Makefile | 8 + deps/tre/LICENSE | 29 + deps/tre/Makefile | 79 + deps/tre/README.md | 276 ++ deps/tre/lib/regcomp.c | 188 ++ deps/tre/lib/regerror.c | 86 + deps/tre/lib/regexec.c | 584 ++++ deps/tre/lib/tre-ast.c | 226 ++ deps/tre/lib/tre-ast.h | 128 + deps/tre/lib/tre-compile.c | 2673 ++++++++++++++++++ deps/tre/lib/tre-compile.h | 27 + deps/tre/lib/tre-filter.c | 73 + deps/tre/lib/tre-filter.h | 19 + deps/tre/lib/tre-internal.h | 319 +++ deps/tre/lib/tre-match-backtrack.c | 676 +++++ deps/tre/lib/tre-match-parallel.c | 538 ++++ deps/tre/lib/tre-match-utils.h | 215 ++ deps/tre/lib/tre-mem.c | 155 + deps/tre/lib/tre-mem.h | 66 + deps/tre/lib/tre-parse.c | 1758 ++++++++++++ deps/tre/lib/tre-parse.h | 52 + deps/tre/lib/tre-stack.c | 123 + deps/tre/lib/tre-stack.h | 76 + deps/tre/lib/xmalloc.c | 362 +++ deps/tre/lib/xmalloc.h | 77 + deps/tre/local_includes/regex.h | 48 + deps/tre/local_includes/tre-config.h | 14 + deps/tre/local_includes/tre.h | 344 +++ deps/tre/tests/retest.c | 1871 +++++++++++++ deps/tre/tests/test-literal-opt.c | 303 ++ deps/tre/tests/test-malformed-regn.c | 85 + deps/tre/tests/test-str-source.c | 192 ++ redis.conf | 34 +- src/Makefile | 6 +- src/acl.c | 1 + src/aof.c | 112 + src/commands.def | 549 ++++ src/commands/arcount.json | 48 + src/commands/ardel.json | 53 + 
src/commands/ardelrange.json | 62 + src/commands/arget.json | 60 + src/commands/argetrange.json | 64 + src/commands/argrep.json | 182 ++ src/commands/arinfo.json | 103 + src/commands/arinsert.json | 54 + src/commands/arlastitems.json | 66 + src/commands/arlen.json | 48 + src/commands/armget.json | 62 + src/commands/armset.json | 64 + src/commands/arnext.json | 56 + src/commands/arop.json | 123 + src/commands/arring.json | 57 + src/commands/arscan.json | 76 + src/commands/arseek.json | 52 + src/commands/arset.json | 58 + src/commands/command-docs.json | 3 + src/config.c | 31 + src/db.c | 14 +- src/debug.c | 15 + src/defrag.c | 32 + src/hotkeys.c | 5 - src/lazyfree.c | 3 + src/module.c | 1 + src/networking.c | 12 + src/notify.c | 2 + src/object.c | 26 +- src/object.h | 2 + src/rdb.c | 254 +- src/rdb.h | 5 +- src/redis-check-rdb.c | 1 + src/redismodule.h | 6 +- src/server.h | 37 +- src/sparsearray.c | 2080 ++++++++++++++ src/sparsearray.h | 312 +++ src/t_array.c | 2021 +++++++++++++ src/util.h | 6 + tests/assets/array-32bit.rdb | Bin 0 -> 808 bytes tests/integration/corrupt-dump-fuzzer.tcl | 11 +- tests/integration/dismiss-mem.tcl | 9 + tests/support/util.tcl | 46 +- tests/unit/aofrw.tcl | 64 + tests/unit/memefficiency.tcl | 91 + tests/unit/type/array.tcl | 3114 +++++++++++++++++++++ tools/array-bench.py | 431 +++ utils/generate-command-code.py | 2 + 86 files changed, 22258 insertions(+), 39 deletions(-) create mode 100644 deps/tre/LICENSE create mode 100644 deps/tre/Makefile create mode 100644 deps/tre/README.md create mode 100644 deps/tre/lib/regcomp.c create mode 100644 deps/tre/lib/regerror.c create mode 100644 deps/tre/lib/regexec.c create mode 100644 deps/tre/lib/tre-ast.c create mode 100644 deps/tre/lib/tre-ast.h create mode 100644 deps/tre/lib/tre-compile.c create mode 100644 deps/tre/lib/tre-compile.h create mode 100644 deps/tre/lib/tre-filter.c create mode 100644 deps/tre/lib/tre-filter.h create mode 100644 deps/tre/lib/tre-internal.h create mode 100644 
deps/tre/lib/tre-match-backtrack.c create mode 100644 deps/tre/lib/tre-match-parallel.c create mode 100644 deps/tre/lib/tre-match-utils.h create mode 100644 deps/tre/lib/tre-mem.c create mode 100644 deps/tre/lib/tre-mem.h create mode 100644 deps/tre/lib/tre-parse.c create mode 100644 deps/tre/lib/tre-parse.h create mode 100644 deps/tre/lib/tre-stack.c create mode 100644 deps/tre/lib/tre-stack.h create mode 100644 deps/tre/lib/xmalloc.c create mode 100644 deps/tre/lib/xmalloc.h create mode 100644 deps/tre/local_includes/regex.h create mode 100644 deps/tre/local_includes/tre-config.h create mode 100644 deps/tre/local_includes/tre.h create mode 100644 deps/tre/tests/retest.c create mode 100644 deps/tre/tests/test-literal-opt.c create mode 100644 deps/tre/tests/test-malformed-regn.c create mode 100644 deps/tre/tests/test-str-source.c create mode 100644 src/commands/arcount.json create mode 100644 src/commands/ardel.json create mode 100644 src/commands/ardelrange.json create mode 100644 src/commands/arget.json create mode 100644 src/commands/argetrange.json create mode 100644 src/commands/argrep.json create mode 100644 src/commands/arinfo.json create mode 100644 src/commands/arinsert.json create mode 100644 src/commands/arlastitems.json create mode 100644 src/commands/arlen.json create mode 100644 src/commands/armget.json create mode 100644 src/commands/armset.json create mode 100644 src/commands/arnext.json create mode 100644 src/commands/arop.json create mode 100644 src/commands/arring.json create mode 100644 src/commands/arscan.json create mode 100644 src/commands/arseek.json create mode 100644 src/commands/arset.json create mode 100644 src/sparsearray.c create mode 100644 src/sparsearray.h create mode 100644 src/t_array.c create mode 100644 tests/assets/array-32bit.rdb create mode 100644 tests/unit/type/array.tcl create mode 100755 tools/array-bench.py diff --git a/.gitignore b/.gitignore index 5ed94f1da..63968fb29 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 
+30,7 @@ deps/lua/src/luac deps/lua/src/liblua.a deps/hdr_histogram/libhdrhistogram.a deps/fpconv/libfpconv.a +deps/tre/libtre.a tests/tls/* .make-* .prerequisites diff --git a/deps/Makefile b/deps/Makefile index ef6168bbd..7ca6de4c2 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -59,6 +59,7 @@ distclean: -(cd jemalloc && [ -f Makefile ] && $(MAKE) distclean) > /dev/null || true -(cd hdr_histogram && $(MAKE) clean) > /dev/null || true -(cd fpconv && $(MAKE) clean) > /dev/null || true + -(cd tre && $(MAKE) clean) > /dev/null || true -(cd xxhash && $(MAKE) clean) > /dev/null || true -(rm -f .make-*) @@ -94,6 +95,13 @@ fpconv: .make-prerequisites .PHONY: fpconv +tre: .make-prerequisites + @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) + cd tre && $(MAKE) CFLAGS="$(DEPS_CFLAGS)" LDFLAGS="$(DEPS_LDFLAGS)" + +.PHONY: tre + + XXHASH_CFLAGS = -fPIC $(DEPS_CFLAGS) xxhash: .make-prerequisites @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) diff --git a/deps/tre/LICENSE b/deps/tre/LICENSE new file mode 100644 index 000000000..76ea75f40 --- /dev/null +++ b/deps/tre/LICENSE @@ -0,0 +1,29 @@ +This is the license, copyright notice, and disclaimer for TRE, a regex +matching package (library and tools) with support for approximate +matching. + +Copyright (c) 2001-2009 Ville Laurikari +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/deps/tre/Makefile b/deps/tre/Makefile new file mode 100644 index 000000000..507487749 --- /dev/null +++ b/deps/tre/Makefile @@ -0,0 +1,79 @@ +STD= -std=c99 +WARN= -Wall +OPT= -Os + +ifeq ($(SANITIZER),address) + CFLAGS+=-fsanitize=address -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=address +else +ifeq ($(SANITIZER),undefined) + CFLAGS+=-fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=undefined +else +ifeq ($(SANITIZER),thread) + CFLAGS+=-fsanitize=thread -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=thread +else +ifeq ($(SANITIZER),memory) + CFLAGS+=-fsanitize=memory -fsanitize-memory-track-origins=2 -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=memory +endif +endif +endif +endif + +R_CFLAGS= $(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) -DTRE_REGEX_T_FIELD=value -Ilocal_includes -Ilib +R_LDFLAGS= $(LDFLAGS) +DEBUG= -g + +R_CC=$(CC) $(R_CFLAGS) +R_LD=$(CC) $(R_LDFLAGS) + +AR= ar +ARFLAGS= rcs + +TRE_OBJ=lib/regcomp.o lib/regerror.o lib/regexec.o lib/tre-ast.o lib/tre-compile.o \ + lib/tre-filter.o lib/tre-match-backtrack.o lib/tre-match-parallel.o \ + lib/tre-mem.o lib/tre-parse.o 
lib/tre-stack.o lib/xmalloc.o +TRE_TESTS=tests/retest tests/test-str-source tests/test-literal-opt tests/test-malformed-regn + +libtre.a: $(TRE_OBJ) + $(AR) $(ARFLAGS) $@ $+ + +check: $(TRE_TESTS) + @set -e; \ + for test in $(TRE_TESTS); do \ + echo "TEST $$test"; \ + ./$$test; \ + done + +tests/retest: tests/retest.c libtre.a + $(R_LD) $(R_CFLAGS) -DHAVE_REGNEXEC -DHAVE_REGNCOMP -o $@ $< libtre.a + +tests/test-str-source: tests/test-str-source.c libtre.a + $(R_LD) $(R_CFLAGS) -o $@ $< libtre.a + +tests/test-literal-opt: tests/test-literal-opt.c libtre.a + $(R_LD) $(R_CFLAGS) -o $@ $< libtre.a + +tests/test-malformed-regn: tests/test-malformed-regn.c libtre.a + $(R_LD) $(R_CFLAGS) -o $@ $< libtre.a + +lib/regcomp.o: lib/regcomp.c local_includes/tre.h local_includes/tre-config.h lib/tre-internal.h lib/xmalloc.h +lib/regerror.o: lib/regerror.c local_includes/tre.h +lib/regexec.o: lib/regexec.c local_includes/tre.h lib/tre-internal.h lib/xmalloc.h +lib/tre-ast.o: lib/tre-ast.c lib/tre-ast.h lib/tre-internal.h +lib/tre-compile.o: lib/tre-compile.c lib/tre-compile.h lib/tre-internal.h lib/tre-mem.h lib/tre-parse.h lib/tre-stack.h lib/xmalloc.h +lib/tre-filter.o: lib/tre-filter.c lib/tre-filter.h lib/tre-internal.h +lib/tre-match-backtrack.o: lib/tre-match-backtrack.c lib/tre-internal.h lib/tre-match-utils.h lib/tre-mem.h lib/tre-stack.h +lib/tre-match-parallel.o: lib/tre-match-parallel.c lib/tre-internal.h lib/tre-match-utils.h lib/tre-mem.h +lib/tre-mem.o: lib/tre-mem.c lib/tre-mem.h +lib/tre-parse.o: lib/tre-parse.c lib/tre-ast.h lib/tre-compile.h lib/tre-filter.h lib/tre-internal.h lib/tre-mem.h lib/tre-parse.h lib/tre-stack.h lib/xmalloc.h +lib/tre-stack.o: lib/tre-stack.c lib/tre-internal.h lib/tre-stack.h +lib/xmalloc.o: lib/xmalloc.c lib/xmalloc.h + +.c.o: + $(R_CC) -c -o $@ $< + +clean: + rm -f $(TRE_OBJ) libtre.a $(TRE_TESTS) diff --git a/deps/tre/README.md b/deps/tre/README.md new file mode 100644 index 000000000..b2e09bbcb --- /dev/null +++ 
b/deps/tre/README.md @@ -0,0 +1,276 @@ +Introduction +============ + +TRE is a lightweight, robust, and efficient POSIX compliant regexp +matching library with some exciting features such as approximate +(fuzzy) matching. + +The matching algorithm used in TRE uses linear worst-case time in +the length of the text being searched, and quadratic worst-case +time in the length of the used regular expression. + +In other words, the time complexity of the algorithm is O(M^2N), where +M is the length of the regular expression and N is the length of the +text. The used space is also quadratic on the length of the regex, +but does not depend on the searched string. This quadratic behaviour +occurs only on pathological cases which are probably very rare in +practice. + + +Hacking +======= + +Here's how to work with this code. + +Prerequisites +------------- + +You will need the following tools installed on your system: + + - autoconf + - automake + - gettext (including autopoint) + - libtool + - zip (optional) + + +Building +-------- + +First, prepare the tree. Change to the root of the source directory +and run + + ./utils/autogen.sh + +This will regenerate various things using the prerequisite tools so +that you end up with a buildable tree. + +After this, you can run the configure script and build TRE as usual: + + ./configure + make + make check + make install + + +Building a source code package +------------------------------ + +In a prepared tree, this command creates a source code tarball: + + ./configure && make dist + +Alternatively, you can run + + ./utils/build-sources.sh + +which builds the source code packages and puts them in the `dist` +subdirectory. This script needs a working `zip` command. + + +Features +======== + +TRE is not just yet another regexp matcher. TRE has some features +which are not there in most free POSIX compatible implementations. +Most of these features are not present in non-free implementations +either, for that matter. 
+ +Approximate matching +-------------------- + +Approximate pattern matching allows matches to be approximate, that +is, allows the matches to be close to the searched pattern under some +measure of closeness. TRE uses the edit-distance measure (also known +as the Levenshtein distance) where characters can be inserted, +deleted, or substituted in the searched text in order to get an exact +match. + +Each insertion, deletion, or substitution adds to the distance, or cost, +of the match. TRE can report the matches which have a cost lower than +some given threshold value. TRE can also be used to search for +matches with the lowest cost. + +TRE includes a version of the agrep (approximate grep) command line +tool for approximate regexp matching in the style of grep. Unlike +other agrep implementations (like the one by Sun Wu and Udi Manber +from the University of Arizona) TRE agrep allows full regexps of any +length, any number of errors, and non-uniform costs for insertion, +deletion and substitution. + +Strict standard conformance +--------------------------- + +POSIX defines the behaviour of regexp functions precisely. TRE +attempts to conform to these specifications as strictly as possible. +TRE always returns the correct matches for subpatterns, for example. +Very few other implementations do this correctly. In fact, the only +other implementations besides TRE that I am aware of (free or not) +that get it right are Rx by Tom Lord, Regex++ by John Maddock, and the +AT&T ast regex by Glenn Fowler and Doug McIlroy. + +The standard TRE tries to conform to is the IEEE Std 1003.1-2001, or +Open Group Base Specifications Issue 6, commonly referred to as +“POSIX”. The relevant parts are the base specifications on regular +expressions (and the rationale) and the description of the `regcomp()` +API. + +For an excellent survey on POSIX regexp matchers, see the testregex +pages by Glenn Fowler of AT&T Labs Research.
+ +Predictable matching speed +-------------------------- + +Because of the matching algorithm used in TRE, the maximum time +consumed by any `regexec()` call is always directly proportional to +the length of the searched string. There is one exception: if back +references are used, the matching may take time that grows +exponentially with the length of the string. This is because matching +back references is an NP complete problem, and almost certainly +requires exponential time to match in the worst case. + +Predictable and modest memory consumption +----------------------------------------- + +A `regexec()` call never allocates memory from the heap. TRE allocates +all the memory it needs during a `regcomp()` call, and some temporary +working space from the stack frame for the duration of the `regexec()` +call. The amount of temporary space needed is constant during +matching and does not depend on the searched string. For regexps of +reasonable size TRE needs less than 50K of dynamically allocated +memory during the `regcomp()` call, less than 20K for the compiled +pattern buffer, and less than two kilobytes of temporary working space +from the stack frame during a `regexec()` call. There is no time / +memory tradeoff. TRE is also small in code size; statically linking +with TRE increases the executable size less than 30K (gcc-3.2, x86, +GNU/Linux). + +Wide character and multibyte character set support +-------------------------------------------------- + +TRE supports multibyte character sets. This makes it possible to use +regexps seamlessly with, for example, Japanese locales. TRE also +provides a wide character API. + +Binary pattern and data support +------------------------------- + +TRE provides APIs which allow binary zero characters both in regexps +and searched strings. The standard API cannot be easily used to, for +example, search for printable words from binary data (although it is +possible with some hacking). 
Searching for patterns which contain +binary zeroes embedded is not possible at all with the standard API. + +Completely thread safe +---------------------- + +TRE is completely thread safe. All the exported functions are +re-entrant, and a single compiled regexp object can be used +simultaneously in multiple contexts; e.g. in `main()` and a signal +handler, or in many threads of a multithreaded application. + +Portable +-------- + +TRE is portable across multiple platforms. Below is a table of +platforms and compilers used to develop and test TRE: +
+<table>
+<tr><th>Platform</th> <th>Compiler</th></tr>
+<tr><td>FreeBSD 14.1</td> <td>Clang 18</td></tr>
+<tr><td>Ubuntu 22.04</td> <td>GCC 11</td></tr>
+<tr><td>macOS 14.6</td> <td>Clang 14</td></tr>
+<tr><td>Windows 11</td> <td>Microsoft Visual Studio 2022</td></tr>
+</table>
+ +TRE should compile without changes on most modern POSIX-like +platforms, and be easily portable to any platform with a hosted C +implementation. + +Depending on the platform, you may need to install libutf8 to get +wide character and multibyte character set support. + +Free +---- + +TRE is released under a license which is essentially the same as the +“2 clause” BSD-style license used in NetBSD. See the file LICENSE for +details. + +Roadmap +------- + +There are currently two features, both related to collating elements, +missing from 100% POSIX compliance. These are: + +* Support for collating elements (e.g. `[[.\<X\>.]]`, where `\<X\>` is a + collating element). It is not possible to support multi-character + collating elements portably, since POSIX does not define a way to + determine whether a character sequence is a multi-character + collating element or not. + +* Support for equivalence classes, for example `[[=\<X\>=]]`, where + `\<X\>` is a collating element. An equivalence class matches any + character which has the same primary collation weight as `\<X\>`. + Again, POSIX provides no portable mechanism for determining the + primary collation weight of a collating element. + +Note that other portable regexp implementations don't support +collating elements either. The single exception is Regex++, which +comes with its own database for collating elements for different +locales. Support for collating elements and equivalence classes has +not been widely requested and is not very high on the TODO list at the +moment. + +These are other features I'm planning to implement real soon now: + +* All the missing GNU extensions enabled in GNU regex, such as + `[[:<:]]` and `[[:>:]]`. + +* A `REG_SHORTEST` `regexec()` flag for returning the shortest match + instead of the longest match. + +* Perl-compatible syntax: + * `[:^class:]` + Matches anything but the characters in class. Note that + `[^[:class:]]` works already, this would be just a convenience + shorthand.
+ + * `\A` + Match only at beginning of string. + + * `\Z` + Match only at end of string, or before newline at the end. + + * `\z` + Match only at end of string. + + * `\l` + Lowercase next char (think vi). + + * `\u` + Uppercase next char (think vi). + + * `\L` + Lowercase till `\E` (think vi). + + * `\U` + Uppercase till `\E` (think vi). + + * `(?=pattern)` + Zero-width positive look-ahead assertions. + + * `(?!pattern)` + Zero-width negative look-ahead assertions. + + * `(?<=pattern)` + Zero-width positive look-behind assertions. + + * `(? +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include + +#include "tre-internal.h" +#include "xmalloc.h" + +int +tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) +{ + int ret; + if (n > TRE_MAX_RE) + return REG_ESPACE; +#if TRE_WCHAR + tre_char_t *wregex; + size_t wlen; + + wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); + if (wregex == NULL) + return REG_ESPACE; + + /* If the current locale uses the standard single byte encoding of + characters, we don't do a multibyte string conversion. If we did, + many applications which use the default locale would break since + the default "C" locale uses the 7-bit ASCII character set, and + all characters with the eighth bit set would be considered invalid. 
*/ +#if TRE_MULTIBYTE + if (TRE_MB_CUR_MAX == 1) +#endif /* TRE_MULTIBYTE */ + { + size_t i; + const unsigned char *str = (const unsigned char *)regex; + tre_char_t *wstr = wregex; + + for (i = 0; i < n; i++) + *(wstr++) = *(str++); + wlen = n; + } +#if TRE_MULTIBYTE + else + { + size_t consumed; + tre_char_t *wcptr = wregex; +#ifdef HAVE_MBSTATE_T + mbstate_t state; + memset(&state, '\0', sizeof(state)); +#endif /* HAVE_MBSTATE_T */ + while (n > 0) + { + consumed = tre_mbrtowc(wcptr, regex, n, &state); + + switch (consumed) + { + case 0: + if (*regex == '\0') + consumed = 1; + else + { + xfree(wregex); + return REG_BADPAT; + } + break; + case -1: + DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno))); + xfree(wregex); + return REG_BADPAT; + case -2: + /* The last character wasn't complete. Let's not call it a + fatal error. */ + consumed = n; + break; + } + regex += consumed; + n -= consumed; + wcptr++; + } + wlen = wcptr - wregex; + } +#endif /* TRE_MULTIBYTE */ + + wregex[wlen] = L'\0'; + ret = tre_compile(preg, wregex, wlen, cflags); + xfree(wregex); +#else /* !TRE_WCHAR */ + ret = tre_compile(preg, (const tre_char_t *)regex, n, cflags); +#endif /* !TRE_WCHAR */ + + return ret; +} + +/* this version takes bytes literally, to be used with raw vectors */ +int +tre_regncompb(regex_t *preg, const char *regex, size_t n, int cflags) +{ + int ret; + if (n > TRE_MAX_RE) + return REG_ESPACE; +#if TRE_WCHAR /* wide chars = we need to convert it all to the wide format */ + tre_char_t *wregex; + size_t i; + + wregex = xmalloc(sizeof(tre_char_t) * n); + if (wregex == NULL) + return REG_ESPACE; + + for (i = 0; i < n; i++) + wregex[i] = (tre_char_t) ((unsigned char) regex[i]); + + ret = tre_compile(preg, wregex, n, cflags | REG_USEBYTES); + xfree(wregex); +#else /* !TRE_WCHAR */ + ret = tre_compile(preg, (const tre_char_t *)regex, n, cflags | REG_USEBYTES); +#endif /* !TRE_WCHAR */ + + return ret; +} + +int +tre_regcomp(regex_t *preg, const char *regex, int cflags) +{ 
+ size_t n = regex ? strlen(regex) : 0; + if (n > TRE_MAX_RE) + return REG_ESPACE; + return tre_regncomp(preg, regex, n, cflags); +} + +int +tre_regcompb(regex_t *preg, const char *regex, int cflags) +{ + int ret; + tre_char_t *wregex; + size_t i, n = regex ? strlen(regex) : 0; + const unsigned char *str = (const unsigned char *)regex; + tre_char_t *wstr; + + if (n > TRE_MAX_RE) + return REG_ESPACE; + wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); + if (wregex == NULL) return REG_ESPACE; + wstr = wregex; + + for (i = 0; i < n; i++) + *(wstr++) = *(str++); + wregex[n] = L'\0'; + ret = tre_compile(preg, wregex, n, cflags | REG_USEBYTES); + xfree(wregex); + return ret; +} + + +#ifdef TRE_WCHAR +int +tre_regwncomp(regex_t *preg, const wchar_t *regex, size_t n, int cflags) +{ + if (n > TRE_MAX_RE) + return REG_ESPACE; + return tre_compile(preg, regex, n, cflags); +} + +int +tre_regwcomp(regex_t *preg, const wchar_t *regex, int cflags) +{ + size_t n = regex ? wcslen(regex) : 0; + if (n > TRE_MAX_RE) + return REG_ESPACE; + return tre_compile(preg, regex, n, cflags); +} +#endif /* TRE_WCHAR */ + +void +tre_regfree(regex_t *preg) +{ + tre_free(preg); +} + +/* EOF */ diff --git a/deps/tre/lib/regerror.c b/deps/tre/lib/regerror.c new file mode 100644 index 000000000..2f8326ce7 --- /dev/null +++ b/deps/tre/lib/regerror.c @@ -0,0 +1,86 @@ +/* + tre_regerror.c - POSIX tre_regerror() implementation for TRE. + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. 
+ +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#include +#ifdef HAVE_WCHAR_H +#include +#endif /* HAVE_WCHAR_H */ +#ifdef HAVE_WCTYPE_H +#include +#endif /* HAVE_WCTYPE_H */ + +#include "tre-internal.h" + +#ifdef HAVE_GETTEXT +#include +#else +#define dgettext(p, s) s +#define gettext(s) s +#endif + +#define _(String) dgettext(PACKAGE, String) +#define gettext_noop(String) String + +#define xstr(s) str(s) +#define str(s) #s + +/* Error message strings for error codes listed in `tre.h'. This list + needs to be in sync with the codes listed there, naturally. */ +static const char *tre_error_messages[] = + { gettext_noop("No error"), /* REG_OK */ + gettext_noop("No match"), /* REG_NOMATCH */ + gettext_noop("Invalid regexp"), /* REG_BADPAT */ + gettext_noop("Unknown collating element"), /* REG_ECOLLATE */ + gettext_noop("Unknown character class name"), /* REG_ECTYPE */ + gettext_noop("Trailing backslash"), /* REG_EESCAPE */ + gettext_noop("Invalid back reference"), /* REG_ESUBREG */ + gettext_noop("Missing ']'"), /* REG_EBRACK */ + gettext_noop("Missing ')'"), /* REG_EPAREN */ + gettext_noop("Missing '}'"), /* REG_EBRACE */ + gettext_noop("Invalid contents of {}"), /* REG_BADBR */ + gettext_noop("Invalid character range"), /* REG_ERANGE */ + gettext_noop("Out of memory"), /* REG_ESPACE */ + gettext_noop("Invalid use of repetition operators"), /* REG_BADRPT */ + gettext_noop("Maximum repetition in {} larger than " xstr(RE_DUP_MAX)), /* REG_BADMAX */ + }; + +size_t +tre_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) +{ + const char *err; + size_t err_len; + + /*LINTED*/(void)&preg; + if (errcode >= 0 + && errcode < (int)(sizeof(tre_error_messages) + / sizeof(*tre_error_messages))) + err = gettext(tre_error_messages[errcode]); + else + err = gettext("Unknown error"); + + err_len = strlen(err) + 1; + if (errbuf_size > 0 && errbuf != NULL) + { + if (err_len > errbuf_size) + { + strncpy(errbuf, err, errbuf_size - 1); + 
errbuf[errbuf_size - 1] = '\0'; + } + else + { + strcpy(errbuf, err); + } + } + return err_len; +} + +/* EOF */ diff --git a/deps/tre/lib/regexec.c b/deps/tre/lib/regexec.c new file mode 100644 index 000000000..c70eb70a4 --- /dev/null +++ b/deps/tre/lib/regexec.c @@ -0,0 +1,584 @@ +/* + tre_regexec.c - TRE POSIX compatible matching functions (and more). + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#ifdef TRE_USE_ALLOCA +/* AIX requires this to be the first thing in the file. */ +#ifndef __GNUC__ +# if HAVE_ALLOCA_H +# include +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +char *alloca (); +# endif +# endif +# endif +#endif +#endif /* TRE_USE_ALLOCA */ + +#include +#include +#include +#ifdef HAVE_WCHAR_H +#include +#endif /* HAVE_WCHAR_H */ +#ifdef HAVE_WCTYPE_H +#include +#endif /* HAVE_WCTYPE_H */ +#ifndef TRE_WCHAR +#include +#endif /* !TRE_WCHAR */ +#ifdef HAVE_MALLOC_H +#include +#endif /* HAVE_MALLOC_H */ +#include + +#include "tre-internal.h" +#include "xmalloc.h" + +/* Literal alternatives are grouped by the first byte so the matcher can + * reach the relevant candidates in O(1). In nocase mode the lookup uses the + * same folded byte mapping that was applied at compile time. */ +static void +tre_litopt_candidate_range(const tre_literal_opt_t *opt, unsigned char first_byte, + size_t *start, size_t *end) +{ + unsigned char key = opt->nocase ? 
opt->fold_map[first_byte] : first_byte; + *start = opt->start_offsets[key]; + *end = opt->start_offsets[key + 1]; +} + +static int +tre_litopt_bytes_equal(const unsigned char *haystack, + const unsigned char *needle, size_t len, + const unsigned char *fold_map) +{ + size_t i; + + if (fold_map == NULL) + return memcmp(haystack, needle, len) == 0; + + for (i = 0; i < len; i++) + if (fold_map[haystack[i]] != needle[i]) + return 0; + return 1; +} + +static int +tre_litopt_contains_case(const unsigned char *haystack, size_t hay_len, + const unsigned char *needle, size_t needle_len, + int *match_end_ofs) +{ + const unsigned char *p; + size_t remaining; + + if (needle_len > hay_len) + return 0; + + p = haystack; + remaining = hay_len; + while (remaining >= needle_len) + { + p = memchr(p, needle[0], remaining - needle_len + 1); + if (p == NULL) + return 0; + if (memcmp(p, needle, needle_len) == 0) + { + if (match_end_ofs != NULL) + *match_end_ofs = (int)(p - haystack + needle_len); + return 1; + } + remaining = hay_len - (size_t)(p - haystack) - 1; + p++; + } + return 0; +} + +/* Nocase substring matching is still byte-oriented, but scanning once and + * only checking literals that share the same folded first byte avoids the + * old O(haystack * literals) restart pattern. 
*/ +static int +tre_litopt_contains_nocase(const tre_literal_opt_t *opt, + const unsigned char *haystack, size_t hay_len, + int *match_end_ofs) +{ + size_t i, start, end, j; + + for (i = 0; i < hay_len; i++) + { + tre_litopt_candidate_range(opt, haystack[i], &start, &end); + for (j = start; j < end; j++) + { + const tre_literal_opt_literal_t *lit = &opt->literals[j]; + if (lit->len <= hay_len - i + && tre_litopt_bytes_equal(haystack + i, lit->data, lit->len, + opt->fold_map)) + { + if (match_end_ofs != NULL) + *match_end_ofs = (int)(i + lit->len); + return 1; + } + } + } + return 0; +} + +static reg_errcode_t +tre_match_literal_opt(const tre_tnfa_t *tnfa, const char *string, size_t len, + int eflags, int *match_end_ofs) +{ + const tre_literal_opt_t *opt = &tnfa->literal_opt; + const unsigned char *haystack = (const unsigned char *)string; + size_t start = 0, end = opt->num_literals, i; + const unsigned char *fold_map = opt->nocase ? opt->fold_map : NULL; + + if ((opt->mode == TRE_LITERAL_OPT_PREFIX + || opt->mode == TRE_LITERAL_OPT_EXACT) + && (eflags & REG_NOTBOL)) + return REG_NOMATCH; + if ((opt->mode == TRE_LITERAL_OPT_SUFFIX + || opt->mode == TRE_LITERAL_OPT_EXACT) + && (eflags & REG_NOTEOL)) + return REG_NOMATCH; + + if ((opt->mode == TRE_LITERAL_OPT_EXACT + || opt->mode == TRE_LITERAL_OPT_PREFIX) + && len > 0) + tre_litopt_candidate_range(opt, haystack[0], &start, &end); + + if (opt->mode == TRE_LITERAL_OPT_CONTAINS) + { + if (opt->nocase) + return tre_litopt_contains_nocase(opt, haystack, len, match_end_ofs) + ? 
REG_OK : REG_NOMATCH; + + for (i = 0; i < opt->num_literals; i++) + { + const tre_literal_opt_literal_t *lit = &opt->literals[i]; + if (tre_litopt_contains_case(haystack, len, lit->data, lit->len, + match_end_ofs)) + return REG_OK; + } + return REG_NOMATCH; + } + + for (i = start; i < end; i++) + { + const tre_literal_opt_literal_t *lit = &opt->literals[i]; + + switch (opt->mode) + { + case TRE_LITERAL_OPT_EXACT: + if (len == lit->len + && tre_litopt_bytes_equal(haystack, lit->data, len, fold_map)) + { + if (match_end_ofs != NULL) + *match_end_ofs = (int)len; + return REG_OK; + } + break; + + case TRE_LITERAL_OPT_PREFIX: + if (len >= lit->len + && tre_litopt_bytes_equal(haystack, lit->data, lit->len, + fold_map)) + { + if (match_end_ofs != NULL) + *match_end_ofs = (int)lit->len; + return REG_OK; + } + break; + + case TRE_LITERAL_OPT_SUFFIX: + if (len >= lit->len + && tre_litopt_bytes_equal(haystack + len - lit->len, lit->data, + lit->len, fold_map)) + { + if (match_end_ofs != NULL) + *match_end_ofs = (int)len; + return REG_OK; + } + break; + + case TRE_LITERAL_OPT_CONTAINS: + case TRE_LITERAL_OPT_NONE: + break; + } + } + + return REG_NOMATCH; +} + + +/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match + endpoint values. */ +void +tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, + const tre_tnfa_t *tnfa, int *tags, int match_eo) +{ + tre_submatch_data_t *submatch_data; + unsigned int i, j; + int *parents; + + i = 0; + if (match_eo >= 0 && !(cflags & REG_NOSUB)) + { + /* Construct submatch offsets from the tags. 
*/ + DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); + submatch_data = tnfa->submatch_data; + while (i < tnfa->num_submatches && i < nmatch) + { + if (submatch_data[i].so_tag == tnfa->end_tag) + pmatch[i].rm_so = match_eo; + else + pmatch[i].rm_so = tags[submatch_data[i].so_tag]; + + if (submatch_data[i].eo_tag == tnfa->end_tag) + pmatch[i].rm_eo = match_eo; + else + pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; + + /* If either of the endpoints were not used, this submatch + was not part of the match. */ + if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) + pmatch[i].rm_so = pmatch[i].rm_eo = -1; + + DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, + submatch_data[i].so_tag, pmatch[i].rm_so, + submatch_data[i].eo_tag, pmatch[i].rm_eo)); + i++; + } + /* Reset all submatches that are not within all of their parent + submatches. */ + i = 0; + while (i < tnfa->num_submatches && i < nmatch) + { + if (pmatch[i].rm_eo == -1) + assert(pmatch[i].rm_so == -1); + assert(pmatch[i].rm_so <= pmatch[i].rm_eo); + + parents = submatch_data[i].parents; + if (parents != NULL) + for (j = 0; parents[j] >= 0; j++) + { + DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); + if (pmatch[i].rm_so < pmatch[parents[j]].rm_so + || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) + pmatch[i].rm_so = pmatch[i].rm_eo = -1; + } + i++; + } + } + + while (i < nmatch) + { + pmatch[i].rm_so = -1; + pmatch[i].rm_eo = -1; + i++; + } +} + + +/* + Wrapper functions for POSIX compatible regexp matching. 
+*/ + +int +tre_have_backrefs(const regex_t *preg) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tnfa->have_backrefs; +} + +int +tre_have_approx(const regex_t *preg) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tnfa->have_approx; +} + +static int +tre_match(const tre_tnfa_t *tnfa, const void *string, ssize_t len, + tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], + int eflags) +{ + reg_errcode_t status; + int *tags = NULL, eo; + if (tnfa->num_tags > 0 && nmatch > 0) + { +#ifdef TRE_USE_ALLOCA + tags = alloca(sizeof(*tags) * tnfa->num_tags); +#else /* !TRE_USE_ALLOCA */ + tags = xmalloc(sizeof(*tags) * tnfa->num_tags); +#endif /* !TRE_USE_ALLOCA */ + if (tags == NULL) + return REG_ESPACE; + } + + if (type == STR_BYTE + && tnfa->literal_opt.mode != TRE_LITERAL_OPT_NONE + && (nmatch == 0 || (tnfa->cflags & REG_NOSUB)) +#ifdef TRE_APPROX + && !(eflags & REG_APPROX_MATCHER) +#endif /* TRE_APPROX */ + && !(eflags & REG_BACKTRACKING_MATCHER)) + { + size_t byte_len = (len >= 0) ? (size_t)len : strlen((const char *)string); + status = tre_match_literal_opt(tnfa, string, byte_len, eflags, &eo); + + /* Even when the caller asked for no submatches, regexec() still has to + * clear any pmatch entries it was handed. The normal matcher path does + * this through tre_fill_pmatch(), so mirror that behavior here. */ + if (status == REG_OK && nmatch > 0) + tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, NULL, eo); + +#ifndef TRE_USE_ALLOCA + if (tags) + xfree(tags); +#endif /* !TRE_USE_ALLOCA */ + return status; + } + + /* Dispatch to the appropriate matcher. */ + if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) + { + /* The regex has back references, use the backtracking matcher. */ + if (type == STR_USER) + { + const tre_str_source *source = string; + if (source->rewind == NULL || source->compare == NULL) + { + /* The backtracking matcher requires rewind and compare + capabilities from the input stream. 
*/ +#ifndef TRE_USE_ALLOCA + if (tags) + xfree(tags); +#endif /* !TRE_USE_ALLOCA */ + return REG_BADPAT; + } + } + status = tre_tnfa_run_backtrack(tnfa, string, len, type, + tags, eflags, &eo); + } +#ifdef TRE_APPROX + else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER) + { + /* The regex uses approximate matching, use the approximate matcher. */ + regamatch_t match; + regaparams_t params; + tre_regaparams_default(&params); + params.max_err = 0; + params.max_cost = 0; + status = tre_tnfa_run_approx(tnfa, string, len, type, tags, + &match, params, eflags, &eo); + } +#endif /* TRE_APPROX */ + else + { + /* Exact matching, no back references, use the parallel matcher. */ + status = tre_tnfa_run_parallel(tnfa, string, len, type, + tags, eflags, &eo); + } + + if (status == REG_OK) + /* A match was found, so fill the submatch registers. */ + tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); +#ifndef TRE_USE_ALLOCA + if (tags) + xfree(tags); +#endif /* !TRE_USE_ALLOCA */ + return status; +} + +int +tre_regnexec(const regex_t *preg, const char *str, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ?
STR_BYTE : STR_MBS; + + return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); +} + +#ifdef TRE_USE_GNUC_REGEXEC_FPL +int +tre_regexec(const regex_t *preg, const char *str, + size_t nmatch, regmatch_t pmatch[_Restrict_arr_ _REGEX_NELTS (nmatch)], + int eflags) +#else +int +tre_regexec(const regex_t *preg, const char *str, + size_t nmatch, regmatch_t pmatch[], int eflags) +#endif +{ + return tre_regnexec(preg, str, -1, nmatch, pmatch, eflags); +} + +int +tre_regexecb(const regex_t *preg, const char *str, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + + return tre_match(tnfa, str, -1, STR_BYTE, nmatch, pmatch, eflags); +} + +int +tre_regnexecb(const regex_t *preg, const char *str, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + + return tre_match(tnfa, str, len, STR_BYTE, nmatch, pmatch, eflags); +} + + +#ifdef TRE_WCHAR + +int +tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); +} + +int +tre_regwexec(const regex_t *preg, const wchar_t *str, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + return tre_regwnexec(preg, str, -1, nmatch, pmatch, eflags); +} + +#endif /* TRE_WCHAR */ + +int +tre_reguexec(const regex_t *preg, const tre_str_source *str, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tre_match(tnfa, str, -1, STR_USER, nmatch, pmatch, eflags); +} + + +#ifdef TRE_APPROX + +/* + Wrapper functions for approximate regexp matching. 
+*/ + +static int +tre_match_approx(const tre_tnfa_t *tnfa, const void *string, ssize_t len, + tre_str_type_t type, regamatch_t *match, regaparams_t params, + int eflags) +{ + reg_errcode_t status; + int *tags = NULL, eo; + + /* If the regexp does not use approximate matching features, the + maximum cost is zero, and the approximate matcher isn't forced, + use the exact matcher instead. */ + if (params.max_cost == 0 && !tnfa->have_approx + && !(eflags & REG_APPROX_MATCHER)) + return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch, + eflags); + + /* Back references are not supported by the approximate matcher. */ + if (tnfa->have_backrefs) + return REG_BADPAT; + + if (tnfa->num_tags > 0 && match->nmatch > 0) + { +#if TRE_USE_ALLOCA + tags = alloca(sizeof(*tags) * tnfa->num_tags); +#else /* !TRE_USE_ALLOCA */ + tags = xmalloc(sizeof(*tags) * tnfa->num_tags); +#endif /* !TRE_USE_ALLOCA */ + if (tags == NULL) + return REG_ESPACE; + } + status = tre_tnfa_run_approx(tnfa, string, len, type, tags, + match, params, eflags, &eo); + if (status == REG_OK) + tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo); +#ifndef TRE_USE_ALLOCA + if (tags) + xfree(tags); +#endif /* !TRE_USE_ALLOCA */ + return status; +} + +int +tre_reganexec(const regex_t *preg, const char *str, size_t len, + regamatch_t *match, regaparams_t params, int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? 
STR_BYTE : STR_MBS; + + return tre_match_approx(tnfa, str, len, type, match, params, eflags); +} + +int +tre_regaexec(const regex_t *preg, const char *str, + regamatch_t *match, regaparams_t params, int eflags) +{ + return tre_reganexec(preg, str, -1, match, params, eflags); +} + +int +tre_regaexecb(const regex_t *preg, const char *str, + regamatch_t *match, regaparams_t params, int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + + return tre_match_approx(tnfa, str, -1, STR_BYTE, match, params, eflags); +} + +#ifdef TRE_WCHAR + +int +tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, + regamatch_t *match, regaparams_t params, int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tre_match_approx(tnfa, str, len, STR_WIDE, + match, params, eflags); +} + +int +tre_regawexec(const regex_t *preg, const wchar_t *str, + regamatch_t *match, regaparams_t params, int eflags) +{ + return tre_regawnexec(preg, str, -1, match, params, eflags); +} + +#endif /* TRE_WCHAR */ + +void +tre_regaparams_default(regaparams_t *params) +{ + memset(params, 0, sizeof(*params)); + params->cost_ins = 1; + params->cost_del = 1; + params->cost_subst = 1; + params->max_cost = INT_MAX; + params->max_ins = INT_MAX; + params->max_del = INT_MAX; + params->max_subst = INT_MAX; + params->max_err = INT_MAX; +} + +#endif /* TRE_APPROX */ + +/* EOF */ diff --git a/deps/tre/lib/tre-ast.c b/deps/tre/lib/tre-ast.c new file mode 100644 index 000000000..5a4bb1940 --- /dev/null +++ b/deps/tre/lib/tre-ast.c @@ -0,0 +1,226 @@ +/* + tre-ast.c - Abstract syntax tree (AST) routines + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. 
+ +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include + +#include "tre-ast.h" +#include "tre-mem.h" + +tre_ast_node_t * +tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) +{ + tre_ast_node_t *node; + + node = tre_mem_calloc(mem, sizeof(*node)); + if (!node) + return NULL; + node->obj = tre_mem_calloc(mem, size); + if (!node->obj) + return NULL; + node->type = type; + node->nullable = -1; + node->submatch_id = -1; + + return node; +} + +tre_ast_node_t * +tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max) +{ + tre_ast_node_t *node; + tre_literal_t *lit; + + node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t)); + if (!node) + return NULL; + lit = node->obj; + lit->code_min = code_min; + lit->code_max = code_max; + lit->position = -1; + + return node; +} + +tre_ast_node_t * +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, + int minimal) +{ + tre_ast_node_t *node; + tre_iteration_t *iter; + + node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t)); + if (!node) + return NULL; + iter = node->obj; + iter->arg = arg; + iter->min = min; + iter->max = max; + iter->minimal = minimal; + node->num_submatches = arg->num_submatches; + + return node; +} + +tre_ast_node_t * +tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) +{ + tre_ast_node_t *node; + + node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t)); + if (node == NULL) + return NULL; + ((tre_union_t *)node->obj)->left = left; + ((tre_union_t *)node->obj)->right = right; + node->num_submatches = left->num_submatches + right->num_submatches; + + return node; +} + +tre_ast_node_t * +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, + tre_ast_node_t *right) +{ + tre_ast_node_t *node; + + node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t)); + if (node == NULL) + return NULL; + ((tre_catenation_t *)node->obj)->left = left; + ((tre_catenation_t *)node->obj)->right = right; + 
node->num_submatches = left->num_submatches + right->num_submatches; + + return node; +} + +#ifdef TRE_DEBUG + +static void +tre_findent(FILE *stream, int i) +{ + while (i-- > 0) + fputc(' ', stream); +} + +void +tre_print_params(int *params) +{ + int i; + if (params) + { + DPRINT(("params [")); + for (i = 0; i < TRE_PARAM_LAST; i++) + { + if (params[i] == TRE_PARAM_UNSET) + DPRINT(("unset")); + else if (params[i] == TRE_PARAM_DEFAULT) + DPRINT(("default")); + else + DPRINT(("%d", params[i])); + if (i < TRE_PARAM_LAST - 1) + DPRINT((", ")); + } + DPRINT(("]")); + } +} + +static void +tre_do_print(FILE *stream, tre_ast_node_t *ast, int indent) +{ + int code_min, code_max, pos; + int num_tags = ast->num_tags; + tre_literal_t *lit; + tre_iteration_t *iter; + + tre_findent(stream, indent); + switch (ast->type) + { + case LITERAL: + lit = ast->obj; + code_min = lit->code_min; + code_max = lit->code_max; + pos = lit->position; + if (IS_EMPTY(lit)) + { + fprintf(stream, "literal empty\n"); + } + else if (IS_ASSERTION(lit)) + { + int i; + char *assertions[] = { "bol", "eol", "ctype", "!ctype", + "bow", "eow", "wb", "!wb" }; + if (code_max >= ASSERT_LAST << 1) + assert(0); + fprintf(stream, "assertions: "); + for (i = 0; (1 << i) <= ASSERT_LAST; i++) + if (code_max & (1 << i)) + fprintf(stream, "%s ", assertions[i]); + fprintf(stream, "\n"); + } + else if (IS_TAG(lit)) + { + fprintf(stream, "tag %d\n", code_max); + } + else if (IS_BACKREF(lit)) + { + fprintf(stream, "backref %d, pos %d\n", code_max, pos); + } + else if (IS_PARAMETER(lit)) + { + tre_print_params(lit->u.params); + fprintf(stream, "\n"); + } + else + { + fprintf(stream, "literal (%c, %c) (%d, %d), pos %d, sub %d, " + "%d tags\n", code_min, code_max, code_min, code_max, pos, + ast->submatch_id, num_tags); + } + break; + case ITERATION: + iter = ast->obj; + fprintf(stream, "iteration {%d, %d}, sub %d, %d tags, %s\n", + iter->min, iter->max, ast->submatch_id, num_tags, + iter->minimal ? 
"minimal" : "greedy"); + tre_do_print(stream, iter->arg, indent + 2); + break; + case UNION: + fprintf(stream, "union, sub %d, %d tags\n", ast->submatch_id, num_tags); + tre_do_print(stream, ((tre_union_t *)ast->obj)->left, indent + 2); + tre_do_print(stream, ((tre_union_t *)ast->obj)->right, indent + 2); + break; + case CATENATION: + fprintf(stream, "catenation, sub %d, %d tags\n", ast->submatch_id, + num_tags); + tre_do_print(stream, ((tre_catenation_t *)ast->obj)->left, indent + 2); + tre_do_print(stream, ((tre_catenation_t *)ast->obj)->right, indent + 2); + break; + default: + assert(0); + break; + } +} + +static void +tre_ast_fprint(FILE *stream, tre_ast_node_t *ast) +{ + tre_do_print(stream, ast, 0); +} + +void +tre_ast_print(tre_ast_node_t *tree) +{ + printf("AST:\n"); + tre_ast_fprint(stdout, tree); +} + +#endif /* TRE_DEBUG */ + +/* EOF */ diff --git a/deps/tre/lib/tre-ast.h b/deps/tre/lib/tre-ast.h new file mode 100644 index 000000000..190c4b033 --- /dev/null +++ b/deps/tre/lib/tre-ast.h @@ -0,0 +1,128 @@ +/* + tre-ast.h - Abstract syntax tree (AST) definitions + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + + +#ifndef TRE_AST_H +#define TRE_AST_H 1 + +#include "tre-mem.h" +#include "tre-internal.h" +#include "tre-compile.h" + +/* The different AST node types. */ +typedef enum { + LITERAL, + CATENATION, + ITERATION, + UNION +} tre_ast_type_t; + +/* Special subtypes of TRE_LITERAL. */ +#define EMPTY -1 /* Empty leaf (denotes empty string). */ +#define ASSERTION -2 /* Assertion leaf. */ +#define TAG -3 /* Tag leaf. */ +#define BACKREF -4 /* Back reference leaf. */ +#define PARAMETER -5 /* Parameter. 
*/
+/* Predicates on a tre_literal_t pointer `x'.  Each special subtype is
+   encoded as a negative code_min, which is what IS_SPECIAL tests. */
+#define IS_SPECIAL(x) ((x)->code_min < 0)
+#define IS_EMPTY(x) ((x)->code_min == EMPTY)
+#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
+#define IS_TAG(x) ((x)->code_min == TAG)
+#define IS_BACKREF(x) ((x)->code_min == BACKREF)
+#define IS_PARAMETER(x) ((x)->code_min == PARAMETER)
+
+
+/* A generic AST node. All AST nodes consist of this node on the top
+   level with `obj' pointing to the actual content. */
+typedef struct {
+  tre_ast_type_t type; /* Type of the node. */
+  void *obj; /* Pointer to actual node. */
+  int nullable; /* Set to -1 by tre_ast_new_node(). */
+  int submatch_id; /* -1 from tre_ast_new_node(); tre_add_tags() treats
+                      values >= 0 as an index into submatch data. */
+  unsigned int num_submatches; /* The iteration/union/catenation
+                                  constructors derive this from the
+                                  children. */
+  unsigned int num_tags; /* Filled in by the first pass of
+                            tre_add_tags(). */
+  tre_pos_and_tags_t *firstpos; /* NOTE(review): not written by the
+                                   constructors; presumably computed
+                                   later in compilation -- confirm. */
+  tre_pos_and_tags_t *lastpos; /* See firstpos. */
+} tre_ast_node_t;
+
+
+/* A "literal" node. These are created for assertions, back references,
+   tags, matching parameter settings, and all expressions that match one
+   character. */
+typedef struct {
+  long code_min; /* Negative for the special subtypes (see IS_SPECIAL). */
+  long code_max; /* For TAG leaves the tag number; for ASSERTION leaves a
+                    bitmask of assertions. */
+  int position; /* Initialised to -1 by tre_ast_new_literal(). */
+  union {
+    tre_ctype_t class;
+    int *params; /* Read for PARAMETER leaves. */
+  } u;
+  tre_ctype_t *neg_classes;
+} tre_literal_t;
+
+/* A "catenation" node. These are created when two regexps are concatenated.
+   If there are more than one subexpressions in sequence, the `left' part
+   holds all but the last, and `right' part holds the last subexpression
+   (catenation is left associative). */
+typedef struct {
+  tre_ast_node_t *left;
+  tre_ast_node_t *right;
+} tre_catenation_t;
+
+/* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
+   operators. */
+typedef struct {
+  /* Subexpression to match. */
+  tre_ast_node_t *arg;
+  /* Minimum number of consecutive matches. */
+  int min;
+  /* Maximum number of consecutive matches. */
+  int max;
+  /* If 0, match as many characters as possible, if 1 match as few as
+     possible. Note that this does not always mean the same thing as
+     matching as many/few repetitions as possible. */
+  unsigned int minimal:1;
+  /* Approximate matching parameters (or NULL). */
+  int *params;
+} tre_iteration_t;
+
+/* An "union" node.
These are created for the "|" operator.  `left' and `right' are
+   stored exactly as passed to tre_ast_new_union(). */
+typedef struct {
+  tre_ast_node_t *left;
+  tre_ast_node_t *right;
+} tre_union_t;
+/* Allocates a node of `type' with a `size'-byte payload (node->obj) from
+   `mem'.  Returns NULL if either allocation fails. */
+tre_ast_node_t *
+tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
+/* Creates a LITERAL leaf with the given code_min/code_max; its position
+   starts out as -1.  Returns NULL on allocation failure. */
+tre_ast_node_t *
+tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max);
+/* Creates an ITERATION node over `arg', which must not be NULL because its
+   submatch count is copied.  Returns NULL on allocation failure. */
+tre_ast_node_t *
+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
+                 int minimal);
+/* Creates a UNION node; both children must be non-NULL because their
+   submatch counts are summed.  Returns NULL on allocation failure. */
+tre_ast_node_t *
+tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
+/* Creates a CATENATION node; both children must be non-NULL because their
+   submatch counts are summed.  Returns NULL on allocation failure. */
+tre_ast_node_t *
+tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
+                       tre_ast_node_t *right);
+
+#ifdef TRE_DEBUG
+void
+tre_ast_print(tre_ast_node_t *tree);
+
+/* XXX - rethink AST printing API */
+void
+tre_print_params(int *params);
+#endif /* TRE_DEBUG */
+
+#endif /* TRE_AST_H */
+
+/* EOF */
diff --git a/deps/tre/lib/tre-compile.c b/deps/tre/lib/tre-compile.c
new file mode 100644
index 000000000..a3573df5a
--- /dev/null
+++ b/deps/tre/lib/tre-compile.c
@@ -0,0 +1,2673 @@
+/*
+  tre-compile.c - TRE regex compiler
+
+  This software is released under a BSD-style license.
+  See the file LICENSE for details and copyright.
+
+*/
+
+/*
+  TODO:
+   - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
+     function calls.
+*/ + + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include +#include +#include + +#include "tre-internal.h" +#include "tre-mem.h" +#include "tre-stack.h" +#include "tre-ast.h" +#include "tre-parse.h" +#include "tre-compile.h" +#include "xmalloc.h" + +typedef struct { + const tre_ast_node_t **nodes; + size_t len; + size_t cap; +} tre_ast_node_vec_t; + +typedef struct { + unsigned char *bytes; + size_t len; + size_t cap; +} tre_literal_byte_buf_t; + +static unsigned char +tre_litopt_fold_byte(unsigned char c) +{ + return (unsigned char)tre_tolower((tre_cint_t)c); +} + +static void +tre_litopt_free_literal_list(tre_literal_opt_literal_t *literals, size_t count) +{ + size_t i; + + if (literals == NULL) + return; + for (i = 0; i < count; i++) + if (literals[i].data != NULL) + xfree(literals[i].data); + xfree(literals); +} + +static void +tre_litopt_reset_byte_buf(tre_literal_byte_buf_t *buf) +{ + if (buf->bytes != NULL) + xfree(buf->bytes); + buf->bytes = NULL; + buf->len = 0; + buf->cap = 0; +} + +static int +tre_litopt_append_ast_node(tre_ast_node_vec_t *vec, const tre_ast_node_t *node) +{ + const tre_ast_node_t **new_nodes; + size_t new_cap; + + if (vec->len == vec->cap) + { + new_cap = vec->cap ? vec->cap * 2 : 8; + new_nodes = xrealloc(vec->nodes, sizeof(*new_nodes) * new_cap); + if (new_nodes == NULL) + return REG_ESPACE; + vec->nodes = new_nodes; + vec->cap = new_cap; + } + + vec->nodes[vec->len++] = node; + return REG_OK; +} + +static int +tre_litopt_append_byte(tre_literal_byte_buf_t *buf, unsigned char byte) +{ + unsigned char *new_bytes; + size_t new_cap; + + if (buf->len == buf->cap) + { + new_cap = buf->cap ? 
buf->cap * 2 : 8; + new_bytes = xrealloc(buf->bytes, new_cap); + if (new_bytes == NULL) + return REG_ESPACE; + buf->bytes = new_bytes; + buf->cap = new_cap; + } + + buf->bytes[buf->len++] = byte; + return REG_OK; +} + +static int +tre_litopt_append_literal(tre_literal_opt_t *opt, + const tre_literal_byte_buf_t *buf) +{ + tre_literal_opt_literal_t *new_literals; + unsigned char *copy; + size_t new_count; + + new_count = opt->num_literals + 1; + new_literals = xrealloc(opt->literals, sizeof(*new_literals) * new_count); + if (new_literals == NULL) + return REG_ESPACE; + opt->literals = new_literals; + + copy = xmalloc(buf->len); + if (copy == NULL) + return REG_ESPACE; + memcpy(copy, buf->bytes, buf->len); + + opt->literals[opt->num_literals].data = copy; + opt->literals[opt->num_literals].len = buf->len; + opt->num_literals = new_count; + return REG_OK; +} + +/* Fill the fold table once and group literals by the first byte so the + * matcher can jump straight to the small set of candidates that can match + * at a given position. 
*/ +static reg_errcode_t +tre_litopt_prepare(tre_literal_opt_t *opt) +{ + size_t counts[256] = { 0 }; + size_t next[256]; + tre_literal_opt_literal_t *grouped; + size_t i; + + for (i = 0; i < 256; i++) + opt->fold_map[i] = tre_litopt_fold_byte((unsigned char)i); + + memset(opt->start_offsets, 0, sizeof(opt->start_offsets)); + if (opt->num_literals == 0) + return REG_OK; + + for (i = 0; i < opt->num_literals; i++) + counts[opt->literals[i].data[0]]++; + + for (i = 0; i < 256; i++) + opt->start_offsets[i + 1] = opt->start_offsets[i] + counts[i]; + + grouped = xmalloc(sizeof(*grouped) * opt->num_literals); + if (grouped == NULL) + return REG_ESPACE; + + memcpy(next, opt->start_offsets, sizeof(next)); + for (i = 0; i < opt->num_literals; i++) + { + unsigned char first = opt->literals[i].data[0]; + grouped[next[first]++] = opt->literals[i]; + } + + xfree(opt->literals); + opt->literals = grouped; + return REG_OK; +} + +static int +tre_litopt_is_simple_literal(const tre_ast_node_t *node, unsigned char *byte) +{ + tre_literal_t *lit; + + if (node == NULL || node->type != LITERAL) + return 0; + lit = node->obj; + if (IS_SPECIAL(lit) || lit->code_min != lit->code_max) + return 0; + if (lit->code_min < 0 || lit->code_min > UCHAR_MAX) + return 0; + *byte = (unsigned char)lit->code_min; + return 1; +} + +static int +tre_litopt_is_icase_char_union(const tre_ast_node_t *node, int cflags, + unsigned char *byte) +{ + tre_union_t *uni; + unsigned char left, right; + + if (!(cflags & REG_ICASE) || node == NULL || node->type != UNION) + return 0; + + uni = node->obj; + if (!tre_litopt_is_simple_literal(uni->left, &left) + || !tre_litopt_is_simple_literal(uni->right, &right)) + return 0; + + if (tre_litopt_fold_byte(left) != tre_litopt_fold_byte(right)) + return 0; + + *byte = tre_litopt_fold_byte(left); + return 1; +} + +static int +tre_litopt_is_assertion(const tre_ast_node_t *node, int assertion) +{ + tre_literal_t *lit; + + if (node == NULL || node->type != LITERAL) + return 0; + 
lit = node->obj; + return IS_ASSERTION(lit) && lit->code_max == assertion; +} + +static int +tre_litopt_collect_cat_nodes(const tre_ast_node_t *node, tre_ast_node_vec_t *vec) +{ + tre_catenation_t *cat; + int err; + + if (node->type != CATENATION) + return tre_litopt_append_ast_node(vec, node); + + cat = node->obj; + err = tre_litopt_collect_cat_nodes(cat->left, vec); + if (err != REG_OK) + return err; + return tre_litopt_collect_cat_nodes(cat->right, vec); +} + +static int +tre_litopt_collect_alt_nodes(const tre_ast_node_t *node, int cflags, + tre_ast_node_vec_t *vec) +{ + tre_union_t *uni; + unsigned char byte; + int err; + + if (node->type != UNION || tre_litopt_is_icase_char_union(node, cflags, &byte)) + return tre_litopt_append_ast_node(vec, node); + + uni = node->obj; + err = tre_litopt_collect_alt_nodes(uni->left, cflags, vec); + if (err != REG_OK) + return err; + return tre_litopt_collect_alt_nodes(uni->right, cflags, vec); +} + +static int +tre_litopt_collect_literal_string(const tre_ast_node_t *node, int cflags, + tre_literal_byte_buf_t *buf) +{ + tre_catenation_t *cat; + unsigned char byte; + int err; + + switch (node->type) + { + case CATENATION: + cat = node->obj; + err = tre_litopt_collect_literal_string(cat->left, cflags, buf); + if (err != 1) + return err; + return tre_litopt_collect_literal_string(cat->right, cflags, buf); + + case LITERAL: + if (!tre_litopt_is_simple_literal(node, &byte)) + return 0; + if (cflags & REG_ICASE) + byte = tre_litopt_fold_byte(byte); + return tre_litopt_append_byte(buf, byte) == REG_OK ? 1 : -1; + + case UNION: + if (!tre_litopt_is_icase_char_union(node, cflags, &byte)) + return 0; + return tre_litopt_append_byte(buf, byte) == REG_OK ? 
1 : -1; + + default: + return 0; + } +} + +static reg_errcode_t +tre_litopt_try_compile(tre_tnfa_t *tnfa, const tre_ast_node_t *tree, + int cflags, int mb_cur_max) +{ + tre_ast_node_vec_t pieces = { 0 }, alts = { 0 }; + tre_literal_byte_buf_t buf = { 0 }; + tre_literal_opt_t opt = { 0 }; + size_t first, last, i; + int err; + + if (mb_cur_max != 1 || (cflags & REG_NEWLINE)) + return REG_OK; + + err = tre_litopt_collect_cat_nodes(tree, &pieces); + if (err != REG_OK) + goto error; + + first = 0; + last = pieces.len; + + if (first < last && tre_litopt_is_assertion(pieces.nodes[first], ASSERT_AT_BOL)) + first++; + if (first < last && tre_litopt_is_assertion(pieces.nodes[last - 1], ASSERT_AT_EOL)) + last--; + + if (first == last) + goto out; + + if (last - first == 1) + { + err = tre_litopt_collect_alt_nodes(pieces.nodes[first], cflags, &alts); + if (err != REG_OK) + goto error; + + for (i = 0; i < alts.len; i++) + { + err = tre_litopt_collect_literal_string(alts.nodes[i], cflags, &buf); + if (err < 0) + goto error; + if (err == 0 || buf.len == 0) + goto out; + err = tre_litopt_append_literal(&opt, &buf); + if (err != REG_OK) + goto error; + buf.len = 0; + } + } + else + { + for (i = first; i < last; i++) + { + err = tre_litopt_collect_literal_string(pieces.nodes[i], cflags, &buf); + if (err < 0) + goto error; + if (err == 0) + goto out; + } + if (buf.len == 0) + goto out; + err = tre_litopt_append_literal(&opt, &buf); + if (err != REG_OK) + goto error; + buf.len = 0; + } + + if (opt.num_literals == 0) + goto out; + + if (first > 0 && last < pieces.len) + opt.mode = TRE_LITERAL_OPT_EXACT; + else if (first > 0) + opt.mode = TRE_LITERAL_OPT_PREFIX; + else if (last < pieces.len) + opt.mode = TRE_LITERAL_OPT_SUFFIX; + else + opt.mode = TRE_LITERAL_OPT_CONTAINS; + opt.nocase = !!(cflags & REG_ICASE); + err = tre_litopt_prepare(&opt); + if (err != REG_OK) + goto error; + + tnfa->literal_opt = opt; + opt.literals = NULL; + opt.num_literals = 0; + + out: + if (pieces.nodes != 
NULL) + xfree(pieces.nodes); + if (alts.nodes != NULL) + xfree(alts.nodes); + tre_litopt_reset_byte_buf(&buf); + tre_litopt_free_literal_list(opt.literals, opt.num_literals); + return REG_OK; + + error: + if (pieces.nodes != NULL) + xfree(pieces.nodes); + if (alts.nodes != NULL) + xfree(alts.nodes); + tre_litopt_reset_byte_buf(&buf); + tre_litopt_free_literal_list(opt.literals, opt.num_literals); + return REG_ESPACE; +} + +/* + Algorithms to setup tags so that submatch addressing can be done. +*/ + + +/* Inserts a catenation node to the root of the tree given in `node'. + As the left child a new tag with number `tag_id' to `node' is added, + and the right child is the old root. */ +static reg_errcode_t +tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) +{ + tre_catenation_t *c; + + DPRINT(("add_tag_left: tag %d\n", tag_id)); + + c = tre_mem_alloc(mem, sizeof(*c)); + if (c == NULL) + return REG_ESPACE; + c->left = tre_ast_new_literal(mem, TAG, tag_id); + if (c->left == NULL) + return REG_ESPACE; + c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + if (c->right == NULL) + return REG_ESPACE; + + c->right->obj = node->obj; + c->right->type = node->type; + c->right->nullable = -1; + c->right->submatch_id = -1; + c->right->firstpos = NULL; + c->right->lastpos = NULL; + c->right->num_tags = 0; + node->obj = c; + node->type = CATENATION; + return REG_OK; +} + +/* Inserts a catenation node to the root of the tree given in `node'. + As the right child a new tag with number `tag_id' to `node' is added, + and the left child is the old root. 
*/ +static reg_errcode_t +tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) +{ + tre_catenation_t *c; + + DPRINT(("tre_add_tag_right: tag %d\n", tag_id)); + + c = tre_mem_alloc(mem, sizeof(*c)); + if (c == NULL) + return REG_ESPACE; + c->right = tre_ast_new_literal(mem, TAG, tag_id); + if (c->right == NULL) + return REG_ESPACE; + c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + if (c->left == NULL) + return REG_ESPACE; + + c->left->obj = node->obj; + c->left->type = node->type; + c->left->nullable = -1; + c->left->submatch_id = -1; + c->left->firstpos = NULL; + c->left->lastpos = NULL; + c->left->num_tags = 0; + node->obj = c; + node->type = CATENATION; + return REG_OK; +} + +typedef enum { + ADDTAGS_RECURSE, + ADDTAGS_AFTER_ITERATION, + ADDTAGS_AFTER_UNION_LEFT, + ADDTAGS_AFTER_UNION_RIGHT, + ADDTAGS_AFTER_CAT_LEFT, + ADDTAGS_AFTER_CAT_RIGHT, + ADDTAGS_SET_SUBMATCH_END +} tre_addtags_symbol_t; + + +typedef struct { + int tag; + int next_tag; +} tre_tag_states_t; + + +/* Go through `regset' and set submatch data for submatches that are + using this tag. */ +static void +tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag) +{ + int i; + + for (i = 0; regset[i] >= 0; i++) + { + int id = regset[i] / 2; + int start = !(regset[i] % 2); + DPRINT((" Using tag %d for %s offset of " + "submatch %d\n", tag, + start ? "start" : "end", id)); + if (start) + tnfa->submatch_data[id].so_tag = tag; + else + tnfa->submatch_data[id].eo_tag = tag; + } + regset[0] = -1; +} + + +/* Adds tags to appropriate locations in the parse tree in `tree', so that + subexpressions marked for submatch addressing can be traced. */ +static reg_errcode_t +tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, + tre_tnfa_t *tnfa) +{ + reg_errcode_t status = REG_OK; + tre_addtags_symbol_t symbol; + tre_ast_node_t *node = tree; /* Tree node we are currently looking at. 
*/ + size_t bottom = tre_stack_num_items(stack); + /* True for first pass (counting number of needed tags) */ + int first_pass = (mem == NULL || tnfa == NULL); + int *regset, *orig_regset; + unsigned int num_tags = 0; /* Total number of tags. */ + unsigned int num_minimals = 0; /* Number of special minimal tags. */ + unsigned int tag = 0; /* The tag that is to be added next. */ + unsigned int next_tag = 1; /* Next tag to use after this one. */ + int *parents; /* Stack of submatches the current submatch is + contained in. */ + int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */ + tre_tag_states_t *saved_states; + + tre_tag_direction_t direction = TRE_TAG_MINIMIZE; + if (!first_pass) + { + tnfa->end_tag = 0; + tnfa->minimal_tags[0] = -1; + } + + regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2)); + if (regset == NULL) + return REG_ESPACE; + regset[0] = -1; + orig_regset = regset; + + parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1)); + if (parents == NULL) + { + xfree(regset); + return REG_ESPACE; + } + parents[0] = -1; + + saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1)); + if (saved_states == NULL) + { + xfree(regset); + xfree(parents); + return REG_ESPACE; + } + else + { + unsigned int i; + for (i = 0; i <= tnfa->num_submatches; i++) + saved_states[i].tag = -1; + } + + STACK_PUSH(stack, voidptr, node); + STACK_PUSH(stack, int, ADDTAGS_RECURSE); + + while (status == REG_OK && tre_stack_num_items(stack) > bottom) + { + symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); + switch (symbol) + { + + case ADDTAGS_SET_SUBMATCH_END: + { + int id = tre_stack_pop_int(stack); + int i; + + /* Add end of this submatch to regset. */ + for (i = 0; regset[i] >= 0; i++); + regset[i] = id * 2 + 1; + regset[i + 1] = -1; + + /* Pop this submatch from the parents stack. 
*/ + for (i = 0; parents[i] >= 0; i++); + parents[i - 1] = -1; + break; + } + + case ADDTAGS_RECURSE: + node = tre_stack_pop_voidptr(stack); + + if (node->submatch_id >= 0) + { + int id = node->submatch_id; + int i; + + + /* Add start of this submatch to regset. */ + for (i = 0; regset[i] >= 0; i++); + regset[i] = id * 2; + regset[i + 1] = -1; + + if (!first_pass) + { + for (i = 0; parents[i] >= 0; i++); + tnfa->submatch_data[id].parents = NULL; + if (i > 0) + { + int *p = xmalloc(sizeof(*p) * (i + 1)); + if (p == NULL) + { + status = REG_ESPACE; + break; + } + assert(tnfa->submatch_data[id].parents == NULL); + tnfa->submatch_data[id].parents = p; + for (i = 0; parents[i] >= 0; i++) + p[i] = parents[i]; + p[i] = -1; + } + } + + /* Add end of this submatch to regset after processing this + node. */ + STACK_PUSHX(stack, int, node->submatch_id); + STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END); + } + + switch (node->type) + { + case LITERAL: + { + tre_literal_t *lit = node->obj; + + if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) + { + int i; + DPRINT(("Literal %d-%d\n", + (int)lit->code_min, (int)lit->code_max)); + if (regset[0] >= 0) + { + /* Regset is not empty, so add a tag before the + literal or backref. 
*/ + if (!first_pass) + { + status = tre_add_tag_left(mem, node, tag); + tnfa->tag_directions[tag] = direction; + if (minimal_tag >= 0) + { + DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; + } + tre_purge_regset(regset, tnfa, tag); + } + else + { + DPRINT((" num_tags = 1\n")); + node->num_tags = 1; + } + + DPRINT((" num_tags++\n")); + regset[0] = -1; + tag = next_tag; + num_tags++; + next_tag++; + } + } + else + { + assert(!IS_TAG(lit)); + } + break; + } + case CATENATION: + { + tre_catenation_t *cat = node->obj; + tre_ast_node_t *left = cat->left; + tre_ast_node_t *right = cat->right; + int reserved_tag = -1; + DPRINT(("Catenation, next_tag = %d\n", next_tag)); + + + /* After processing right child. */ + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT); + + /* Process right child. */ + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + /* After processing left child. */ + STACK_PUSHX(stack, int, next_tag + left->num_tags); + DPRINT((" Pushing %d for after left\n", + next_tag + left->num_tags)); + if (left->num_tags > 0 && right->num_tags > 0) + { + /* Reserve the next tag to the right child. */ + DPRINT((" Reserving next_tag %d to right child\n", + next_tag)); + reserved_tag = next_tag; + next_tag++; + } + STACK_PUSHX(stack, int, reserved_tag); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT); + + /* Process left child. 
*/ + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + } + break; + case ITERATION: + { + tre_iteration_t *iter = node->obj; + DPRINT(("Iteration\n")); + + if (first_pass) + { + STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal); + } + else + { + STACK_PUSHX(stack, int, tag); + STACK_PUSHX(stack, int, iter->minimal); + } + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION); + + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + /* Regset is not empty, so add a tag here. */ + if (regset[0] >= 0 || iter->minimal) + { + if (!first_pass) + { + int i; + status = tre_add_tag_left(mem, node, tag); + if (iter->minimal) + tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; + else + tnfa->tag_directions[tag] = direction; + if (minimal_tag >= 0) + { + DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; + } + tre_purge_regset(regset, tnfa, tag); + } + + DPRINT((" num_tags++\n")); + regset[0] = -1; + tag = next_tag; + num_tags++; + next_tag++; + } + direction = TRE_TAG_MINIMIZE; + } + break; + case UNION: + { + tre_union_t *uni = node->obj; + tre_ast_node_t *left = uni->left; + tre_ast_node_t *right = uni->right; + int left_tag; + int right_tag; + + if (regset[0] >= 0) + { + left_tag = next_tag; + right_tag = next_tag + 1; + } + else + { + left_tag = tag; + right_tag = next_tag; + } + + DPRINT(("Union\n")); + + /* After processing right child. 
*/ + STACK_PUSHX(stack, int, right_tag); + STACK_PUSHX(stack, int, left_tag); + STACK_PUSHX(stack, voidptr, regset); + STACK_PUSHX(stack, int, regset[0] >= 0); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT); + + /* Process right child. */ + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + /* After processing left child. */ + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT); + + /* Process left child. */ + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + /* Regset is not empty, so add a tag here. */ + if (regset[0] >= 0) + { + if (!first_pass) + { + int i; + status = tre_add_tag_left(mem, node, tag); + tnfa->tag_directions[tag] = direction; + if (minimal_tag >= 0) + { + DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; + } + tre_purge_regset(regset, tnfa, tag); + } + + DPRINT((" num_tags++\n")); + regset[0] = -1; + tag = next_tag; + num_tags++; + next_tag++; + } + + if (node->num_submatches > 0) + { + /* The next two tags are reserved for markers. */ + next_tag++; + tag = next_tag; + next_tag++; + } + + break; + } + } + + if (node->submatch_id >= 0) + { + int i; + /* Push this submatch on the parents stack. 
*/ + for (i = 0; parents[i] >= 0; i++); + parents[i] = node->submatch_id; + parents[i + 1] = -1; + } + + break; /* end case: ADDTAGS_RECURSE */ + + case ADDTAGS_AFTER_ITERATION: + { + int minimal = 0; + int enter_tag; + node = tre_stack_pop_voidptr(stack); + if (first_pass) + { + node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags + + tre_stack_pop_int(stack); + minimal_tag = -1; + } + else + { + minimal = tre_stack_pop_int(stack); + enter_tag = tre_stack_pop_int(stack); + if (minimal) + minimal_tag = enter_tag; + } + + DPRINT(("After iteration\n")); + if (!first_pass) + { + DPRINT((" Setting direction to %s\n", + minimal ? "minimize" : "maximize")); + if (minimal) + direction = TRE_TAG_MINIMIZE; + else + direction = TRE_TAG_MAXIMIZE; + } + break; + } + + case ADDTAGS_AFTER_CAT_LEFT: + { + int new_tag = tre_stack_pop_int(stack); + next_tag = tre_stack_pop_int(stack); + DPRINT(("After cat left, tag = %d, next_tag = %d\n", + tag, next_tag)); + if (new_tag >= 0) + { + DPRINT((" Setting tag to %d\n", new_tag)); + tag = new_tag; + } + break; + } + + case ADDTAGS_AFTER_CAT_RIGHT: + DPRINT(("After cat right\n")); + node = tre_stack_pop_voidptr(stack); + if (first_pass) + node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags + + ((tre_catenation_t *)node->obj)->right->num_tags; + break; + + case ADDTAGS_AFTER_UNION_LEFT: + DPRINT(("After union left\n")); + /* Lift the bottom of the `regset' array so that when processing + the right operand the items currently in the array are + invisible. The original bottom was saved at ADDTAGS_UNION and + will be restored at ADDTAGS_AFTER_UNION_RIGHT below. 
*/ + while (*regset >= 0) + regset++; + break; + + case ADDTAGS_AFTER_UNION_RIGHT: + { + int added_tags, tag_left, tag_right; + tre_ast_node_t *left = tre_stack_pop_voidptr(stack); + tre_ast_node_t *right = tre_stack_pop_voidptr(stack); + DPRINT(("After union right\n")); + node = tre_stack_pop_voidptr(stack); + added_tags = tre_stack_pop_int(stack); + if (first_pass) + { + node->num_tags = ((tre_union_t *)node->obj)->left->num_tags + + ((tre_union_t *)node->obj)->right->num_tags + added_tags + + ((node->num_submatches > 0) ? 2 : 0); + } + regset = tre_stack_pop_voidptr(stack); + tag_left = tre_stack_pop_int(stack); + tag_right = tre_stack_pop_int(stack); + + /* Add tags after both children, the left child gets a smaller + tag than the right child. This guarantees that we prefer + the left child over the right child. */ + /* XXX - This is not always necessary (if the children have + tags which must be seen for every match of that child). */ + /* XXX - Check if this is the only place where tre_add_tag_right + is used. If so, use tre_add_tag_left (putting the tag before + the child as opposed after the child) and throw away + tre_add_tag_right. 
*/ + if (node->num_submatches > 0) + { + if (!first_pass) + { + status = tre_add_tag_right(mem, left, tag_left); + tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; + status = tre_add_tag_right(mem, right, tag_right); + tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE; + } + DPRINT((" num_tags += 2\n")); + num_tags += 2; + } + direction = TRE_TAG_MAXIMIZE; + break; + } + + default: + assert(0); + break; + + } /* end switch(symbol) */ + } /* end while(tre_stack_num_items(stack) > bottom) */ + + if (!first_pass) + tre_purge_regset(regset, tnfa, tag); + + if (!first_pass && minimal_tag >= 0) + { + int i; + DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; + } + + DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n", + first_pass? "First pass" : "Second pass", num_tags)); + + assert(tree->num_tags == num_tags); + tnfa->end_tag = num_tags; + tnfa->num_tags = num_tags; + tnfa->num_minimals = num_minimals; + xfree(orig_regset); + xfree(parents); + xfree(saved_states); + return status; +} + + + +/* + AST to TNFA compilation routines. +*/ + +typedef enum { + COPY_RECURSE, + COPY_SET_RESULT_PTR +} tre_copyast_symbol_t; + +/* Flags for tre_copy_ast(). 
*/
+#define COPY_REMOVE_TAGS 1
+#define COPY_MAXIMIZE_FIRST_TAG 2
+
+/* Builds a copy of the subtree rooted at `ast' and stores its root in
+   `*copy'.  The tree is walked with an explicit work list kept on `stack'
+   (COPY_RECURSE / COPY_SET_RESULT_PTR entries) rather than by recursion.
+
+   `flags' is a bit mask:
+     COPY_REMOVE_TAGS         every tag literal is copied as an EMPTY literal;
+     COPY_MAXIMIZE_FIRST_TAG  the first tag literal met is recorded as
+                              TRE_TAG_MAXIMIZE in `tag_directions'.
+
+   For ordinary literals and back references the source position plus
+   `*pos_add' is computed and used to raise `*max_pos'.  On return
+   `*pos_add' has been increased by the number of such literals copied.
+
+   Returns REG_OK, or REG_ESPACE when a node cannot be allocated. */
+static reg_errcode_t
+tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
+             int flags, int *pos_add, tre_tag_direction_t *tag_directions,
+             tre_ast_node_t **copy, int *max_pos)
+{
+  reg_errcode_t status = REG_OK;
+  size_t bottom = tre_stack_num_items(stack);
+  int num_copied = 0;
+  int first_tag = 1;
+  tre_ast_node_t **result = copy;
+  tre_copyast_symbol_t symbol;
+
+  STACK_PUSH(stack, voidptr, ast);
+  STACK_PUSH(stack, int, COPY_RECURSE);
+
+  /* Only entries pushed by this call (above `bottom') are consumed. */
+  while (status == REG_OK && tre_stack_num_items(stack) > bottom)
+    {
+      tre_ast_node_t *node;
+      if (status != REG_OK)
+        break;
+
+      symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
+      switch (symbol)
+        {
+        case COPY_SET_RESULT_PTR:
+          /* Redirect where the next copied node gets stored. */
+          result = tre_stack_pop_voidptr(stack);
+          break;
+        case COPY_RECURSE:
+          node = tre_stack_pop_voidptr(stack);
+          switch (node->type)
+            {
+            case LITERAL:
+              {
+                tre_literal_t *lit = node->obj;
+                int pos = lit->position;
+                long min = lit->code_min;
+                long max = lit->code_max;
+                if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
+                  {
+                    /* XXX - e.g. [ab] has only one position but two
+                       nodes, so we are creating holes in the state space
+                       here. Not fatal, just wastes memory. */
+                    pos += *pos_add;
+                    num_copied++;
+                  }
+                else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
+                  {
+                    /* Change this tag to empty. */
+                    min = EMPTY;
+                    max = pos = -1;
+                  }
+                else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
+                         && first_tag)
+                  {
+                    /* Maximize the first tag. */
+                    tag_directions[max] = TRE_TAG_MAXIMIZE;
+                    first_tag = 0;
+                  }
+                *result = tre_ast_new_literal(mem, min, max);
+                if (*result == NULL) {
+                  status = REG_ESPACE;
+                  break;
+                }
+                /* Carry over the payload stored in the literal's union. */
+                if (!IS_SPECIAL(lit)) {
+                  ((tre_literal_t *)(*result)->obj)->u.class = lit->u.class;
+                  ((tre_literal_t *)(*result)->obj)->neg_classes = lit->neg_classes;
+                } else if (IS_PARAMETER(lit)) {
+                  ((tre_literal_t *)(*result)->obj)->u.params = lit->u.params;
+                }
+
+                if (pos > *max_pos)
+                  *max_pos = pos;
+                break;
+              }
+            case UNION:
+              {
+                tre_union_t *uni = node->obj;
+                tre_union_t *tmp;
+                *result = tre_ast_new_union(mem, uni->left, uni->right);
+                if (*result == NULL)
+                  {
+                    status = REG_ESPACE;
+                    break;
+                  }
+                tmp = (*result)->obj;
+                result = &tmp->left;
+                /* Pushed in reverse: the left child is copied first (into
+                   `tmp->left'), then the result pointer is switched to
+                   `tmp->right' and the right child is copied. */
+                STACK_PUSHX(stack, voidptr, uni->right);
+                STACK_PUSHX(stack, int, COPY_RECURSE);
+                STACK_PUSHX(stack, voidptr, &tmp->right);
+                STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
+                STACK_PUSHX(stack, voidptr, uni->left);
+                STACK_PUSHX(stack, int, COPY_RECURSE);
+                break;
+              }
+            case CATENATION:
+              {
+                tre_catenation_t *cat = node->obj;
+                tre_catenation_t *tmp;
+                *result = tre_ast_new_catenation(mem, cat->left, cat->right);
+                if (*result == NULL)
+                  {
+                    status = REG_ESPACE;
+                    break;
+                  }
+                tmp = (*result)->obj;
+                tmp->left = NULL;
+                tmp->right = NULL;
+                result = &tmp->left;
+
+                /* Same ordering as for UNION: left child first. */
+                STACK_PUSHX(stack, voidptr, cat->right);
+                STACK_PUSHX(stack, int, COPY_RECURSE);
+                STACK_PUSHX(stack, voidptr, &tmp->right);
+                STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
+                STACK_PUSHX(stack, voidptr, cat->left);
+                STACK_PUSHX(stack, int, COPY_RECURSE);
+                break;
+              }
+            case ITERATION:
+              {
+                tre_iteration_t *iter = node->obj;
+                /* The argument is queued before the new iteration node is
+                   allocated; its copy lands in the new node's `arg'. */
+                STACK_PUSHX(stack, voidptr, iter->arg);
+                STACK_PUSHX(stack, int, COPY_RECURSE);
+                *result = tre_ast_new_iter(mem, iter->arg, iter->min,
+                                           iter->max, iter->minimal);
+                if (*result == NULL)
+                  {
+                    status = REG_ESPACE;
+                    break;
+                  }
+                iter = (*result)->obj;
+                result = &iter->arg;
+                break;
+              }
+            default:
+              assert(0);
+              break;
+            }
+          break;
+        }
+    }
+  *pos_add += num_copied;
+  return status;
+}
+
+typedef enum {
+  EXPAND_RECURSE,
+  EXPAND_AFTER_ITER
+} tre_expand_ast_symbol_t;
+
+/* Expands each iteration node that has a finite nonzero minimum or maximum
+   iteration count to a catenated sequence of copies of the node.
+
+   The tree is rewritten in place.  Literal positions are shifted by the
+   number of positions added by earlier expansions.  `tag_directions' is
+   handed to tre_copy_ast() for the mandatory copies.  `*max_depth' is
+   raised to the deepest nesting of parameter-carrying iterations seen.
+
+   Returns REG_OK, or the first error reported while copying/allocating
+   (REG_ESPACE for allocation failures in this function). */
+static reg_errcode_t
+tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
+               tre_tag_direction_t *tag_directions, int *max_depth)
+{
+  reg_errcode_t status = REG_OK;
+  size_t bottom = tre_stack_num_items(stack);
+  int pos_add = 0;
+  int pos_add_total = 0;
+  int max_pos = 0;
+  /* Current approximate matching parameters. */
+  int params[TRE_PARAM_LAST];
+  /* Approximate parameter nesting level. */
+  int params_depth = 0;
+  int iter_depth = 0;
+  int i;
+
+  for (i = 0; i < TRE_PARAM_LAST; i++)
+    params[i] = TRE_PARAM_DEFAULT;
+
+  STACK_PUSHR(stack, voidptr, ast);
+  STACK_PUSHR(stack, int, EXPAND_RECURSE);
+  while (status == REG_OK && tre_stack_num_items(stack) > bottom)
+    {
+      tre_ast_node_t *node;
+      tre_expand_ast_symbol_t symbol;
+
+      if (status != REG_OK)
+        break;
+
+      DPRINT(("pos_add %d\n", pos_add));
+
+      symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
+      node = tre_stack_pop_voidptr(stack);
+      switch (symbol)
+        {
+        case EXPAND_RECURSE:
+          switch (node->type)
+            {
+            case LITERAL:
+              {
+                tre_literal_t *lit= node->obj;
+                if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
+                  {
+                    lit->position += pos_add;
+                    if (lit->position > max_pos)
+                      max_pos = lit->position;
+                  }
+                break;
+              }
+            case UNION:
+              {
+                tre_union_t *uni = node->obj;
+                STACK_PUSHX(stack, voidptr, uni->right);
+                STACK_PUSHX(stack, int, EXPAND_RECURSE);
+                STACK_PUSHX(stack, voidptr, uni->left);
+                STACK_PUSHX(stack, int, EXPAND_RECURSE);
+                break;
+              }
+            case CATENATION:
+              {
+                tre_catenation_t *cat = node->obj;
+                STACK_PUSHX(stack, voidptr, cat->right);
+                STACK_PUSHX(stack, int, EXPAND_RECURSE);
+                STACK_PUSHX(stack, voidptr, cat->left);
+                STACK_PUSHX(stack, int, EXPAND_RECURSE);
+                break;
+              }
+            case ITERATION:
+              {
+                tre_iteration_t *iter = node->obj;
+                /* Save the current `pos_add' below the EXPAND_AFTER_ITER
+                   entry; it is popped again once the argument is done. */
+                STACK_PUSHX(stack, int, pos_add);
+                STACK_PUSHX(stack, voidptr, node);
+                STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
+                STACK_PUSHX(stack, voidptr, iter->arg);
+                STACK_PUSHX(stack, int, EXPAND_RECURSE);
+                /* If we are going to expand this node at EXPAND_AFTER_ITER
+                   then don't increase the `pos' fields of the nodes now, it
+                   will get done when expanding. */
+                if (iter->min > 1 || iter->max > 1)
+                  pos_add = 0;
+                iter_depth++;
+                DPRINT(("iter\n"));
+                break;
+              }
+            default:
+              assert(0);
+              break;
+            }
+          break;
+        case EXPAND_AFTER_ITER:
+          {
+            tre_iteration_t *iter = node->obj;
+            int pos_add_last;
+            pos_add = tre_stack_pop_int(stack);
+            pos_add_last = pos_add;
+            if (iter->min > 1 || iter->max > 1)
+              {
+                tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
+                int j;
+                int pos_add_save = pos_add;
+
+                /* Create a catenated sequence of copies of the node. */
+                for (j = 0; j < iter->min; j++)
+                  {
+                    tre_ast_node_t *copy;
+                    /* Remove tags from all but the last copy. */
+                    int flags = ((j + 1 < iter->min)
+                                 ? COPY_REMOVE_TAGS
+                                 : COPY_MAXIMIZE_FIRST_TAG);
+                    DPRINT((" pos_add %d\n", pos_add));
+                    pos_add_save = pos_add;
+                    status = tre_copy_ast(mem, stack, iter->arg, flags,
+                                          &pos_add, tag_directions, &copy,
+                                          &max_pos);
+                    if (status != REG_OK)
+                      return status;
+                    if (seq1 != NULL)
+                      seq1 = tre_ast_new_catenation(mem, seq1, copy);
+                    else
+                      seq1 = copy;
+                    if (seq1 == NULL)
+                      return REG_ESPACE;
+                  }
+
+                if (iter->max == -1)
+                  {
+                    /* No upper limit. */
+                    pos_add_save = pos_add;
+                    status = tre_copy_ast(mem, stack, iter->arg, 0,
+                                          &pos_add, NULL, &seq2, &max_pos);
+                    if (status != REG_OK)
+                      return status;
+                    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
+                    if (seq2 == NULL)
+                      return REG_ESPACE;
+                  }
+                else
+                  {
+                    /* Each optional repetition wraps the ones built so far,
+                       giving nested {0,1} iterations. */
+                    for (j = iter->min; j < iter->max; j++)
+                      {
+                        tre_ast_node_t *copy;
+                        pos_add_save = pos_add;
+                        status = tre_copy_ast(mem, stack, iter->arg, 0,
+                                              &pos_add, NULL, &copy, &max_pos);
+                        if (status != REG_OK)
+                          return status;
+                        if (seq2 != NULL)
+                          seq2 = tre_ast_new_catenation(mem, copy, seq2);
+                        else
+                          seq2 = copy;
+                        if (seq2 == NULL)
+                          return REG_ESPACE;
+                        seq2 = tre_ast_new_iter(mem, seq2, 0, 1, 0);
+                        if (seq2 == NULL)
+                          return REG_ESPACE;
+                      }
+                  }
+
+                pos_add = pos_add_save;
+                if (seq1 == NULL)
+                  seq1 = seq2;
+                else if (seq2 != NULL)
+                  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
+                if (seq1 == NULL)
+                  return REG_ESPACE;
+                /* Turn `node' itself into the expanded sequence. */
+                node->obj = seq1->obj;
+                node->type = seq1->type;
+              }
+
+            iter_depth--;
+            pos_add_total += pos_add - pos_add_last;
+            if (iter_depth == 0)
+              pos_add = pos_add_total;
+
+            /* If approximate parameters are specified, surround the result
+               with two parameter setting nodes.  The one on the left sets
+               the specified parameters, and the one on the right restores
+               the old parameters. */
+            if (iter->params)
+              {
+                tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
+                int *old_params;
+
+                tmp_l = tre_ast_new_literal(mem, PARAMETER, 0);
+                if (!tmp_l)
+                  return REG_ESPACE;
+                ((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
+                iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
+                tmp_r = tre_ast_new_literal(mem, PARAMETER, 0);
+                if (!tmp_r)
+                  return REG_ESPACE;
+                old_params = tre_mem_alloc(mem, sizeof(*old_params)
+                                           * TRE_PARAM_LAST);
+                if (!old_params)
+                  return REG_ESPACE;
+                for (i = 0; i < TRE_PARAM_LAST; i++)
+                  old_params[i] = params[i];
+                ((tre_literal_t *)tmp_r->obj)->u.params = old_params;
+                old_params[TRE_PARAM_DEPTH] = params_depth;
+                /* XXX - this is the only place where ast_new_node is
+                   needed -- should be moved inside AST module. */
+                node_copy = tre_ast_new_node(mem, ITERATION,
+                                             sizeof(tre_iteration_t));
+                if (!node_copy)
+                  return REG_ESPACE;
+                node_copy->obj = node->obj;
+                tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
+                if (!tmp_node)
+                  return REG_ESPACE;
+                tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
+                if (!tmp_node)
+                  return REG_ESPACE;
+                /* Replace the contents of `node' with `tmp_node'. */
+                memcpy(node, tmp_node, sizeof(*node));
+                node->obj = tmp_node->obj;
+                node->type = tmp_node->type;
+                params_depth++;
+                if (params_depth > *max_depth)
+                  *max_depth = params_depth;
+              }
+            break;
+          }
+        default:
+          assert(0);
+          break;
+        }
+    }
+
+#ifdef TRE_DEBUG
+  DPRINT(("Expanded AST:\n"));
+  tre_ast_print(ast);
+#endif
+
+  return status;
+}
+
+/* Allocates a position set that holds only the terminating entry
+   (position -1).  Returns NULL if the allocation fails. */
+static tre_pos_and_tags_t *
+tre_set_empty(tre_mem_t mem)
+{
+  tre_pos_and_tags_t *new_set;
+
+  new_set = tre_mem_calloc(mem, sizeof(*new_set));
+  if (new_set == NULL)
+    return NULL;
+
+  new_set[0].position = -1;
+  new_set[0].code_min = -1;
+  new_set[0].code_max = -1;
+
+  return new_set;
+}
+
+/* Allocates a position set with one entry built from the arguments,
+   followed by the terminating entry (position -1).  Returns NULL if the
+   allocation fails. */
+static tre_pos_and_tags_t *
+tre_set_one(tre_mem_t mem, int position, long code_min, long code_max,
+            tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
+{
+  tre_pos_and_tags_t *new_set;
+
+  new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
+  if (new_set == NULL)
+    return NULL;
+
+  new_set[0].position = position;
+  new_set[0].code_min = code_min;
+  new_set[0].code_max = code_max;
+  new_set[0].class = class;
+  new_set[0].neg_classes = neg_classes;
+  new_set[0].backref = backref;
+  new_set[1].position = -1;
+  new_set[1].code_min = -1;
+  new_set[1].code_max = -1;
+
+  return new_set;
+}
+
+/* Returns a newly allocated set holding the entries of `set1' followed by
+   the entries of `set2', terminated by position -1.
+
+   Entries taken from `set1' get `assertions' OR-ed in and the tags in
+   `tags' (a list terminated by a negative value, may be NULL) appended to
+   their own tag list.  Entries taken from `set2' keep their assertions and
+   get a private copy of their tag list.  `params' (may be NULL) is applied
+   to the entries of both sets.
+
+   Returns NULL if any allocation fails. */
+static tre_pos_and_tags_t *
+tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
+              int *tags, int assertions, int *params)
+{
+  int s1, s2, i, j;
+  tre_pos_and_tags_t *new_set;
+  int *new_tags;
+  int num_tags;
+
+  /* Count the extra tags and the entries of both sets. */
+  for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
+  for (s1 = 0; set1[s1].position >= 0; s1++);
+  for (s2 = 0; set2[s2].position >= 0; s2++);
+  new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
+  if (!new_set )
+    return NULL;
+
+  for (s1 = 0; set1[s1].position >= 0; s1++)
+    {
+      new_set[s1].position = set1[s1].position;
+      new_set[s1].code_min = set1[s1].code_min;
+      new_set[s1].code_max = set1[s1].code_max;
+      new_set[s1].assertions = set1[s1].assertions | assertions;
+      new_set[s1].class = set1[s1].class;
+      new_set[s1].neg_classes = set1[s1].neg_classes;
+      new_set[s1].backref = set1[s1].backref;
+      if (set1[s1].tags == NULL && tags == NULL)
+        new_set[s1].tags = NULL;
+      else
+        {
+          for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
+          new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
+                                         * (i + num_tags + 1)));
+          if (new_tags == NULL)
+            return NULL;
+          for (j = 0; j < i; j++)
+            new_tags[j] = set1[s1].tags[j];
+          for (i = 0; i < num_tags; i++)
+            new_tags[j + i] = tags[i];
+          new_tags[j + i] = -1;
+          new_set[s1].tags = new_tags;
+        }
+      if (set1[s1].params)
+        new_set[s1].params = set1[s1].params;
+      if (params)
+        {
+          if (!new_set[s1].params)
+            new_set[s1].params = params;
+          else
+            {
+              /* NOTE(review): only slots where `params[i]' is set are
+                 written below; the remaining slots of the fresh array are
+                 not filled from the entry's previous params.  Unless
+                 tre_mem_alloc() zero-fills, they stay indeterminate --
+                 confirm the intended merge semantics. */
+              new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
+                                                 TRE_PARAM_LAST);
+              if (!new_set[s1].params)
+                return NULL;
+              for (i = 0; i < TRE_PARAM_LAST; i++)
+                if (params[i] != TRE_PARAM_UNSET)
+                  new_set[s1].params[i] = params[i];
+            }
+        }
+    }
+
+  for (s2 = 0; set2[s2].position >= 0; s2++)
+    {
+      new_set[s1 + s2].position = set2[s2].position;
+      new_set[s1 + s2].code_min = set2[s2].code_min;
+      new_set[s1 + s2].code_max = set2[s2].code_max;
+      /* XXX - why not | assertions here as well? */
+      new_set[s1 + s2].assertions = set2[s2].assertions;
+      new_set[s1 + s2].class = set2[s2].class;
+      new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
+      new_set[s1 + s2].backref = set2[s2].backref;
+      if (set2[s2].tags == NULL)
+        new_set[s1 + s2].tags = NULL;
+      else
+        {
+          for (i = 0; set2[s2].tags[i] >= 0; i++);
+          new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
+          if (new_tags == NULL)
+            return NULL;
+          for (j = 0; j < i; j++)
+            new_tags[j] = set2[s2].tags[j];
+          new_tags[j] = -1;
+          new_set[s1 + s2].tags = new_tags;
+        }
+      if (set2[s2].params)
+        new_set[s1 + s2].params = set2[s2].params;
+      if (params)
+        {
+          if (!new_set[s1 + s2].params)
+            new_set[s1 + s2].params = params;
+          else
+            {
+              /* NOTE(review): same partial fill as in the loop above. */
+              new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
+                                                      TRE_PARAM_LAST);
+              if (!new_set[s1 + s2].params)
+                return NULL;
+              for (i = 0; i < TRE_PARAM_LAST; i++)
+                if (params[i] != TRE_PARAM_UNSET)
+                  new_set[s1 + s2].params[i] = params[i];
+            }
+        }
+    }
+  new_set[s1 + s2].position = -1;
+  return new_set;
+}
+
+/* Finds the empty path through `node' which is the one that should be
+   taken according to POSIX.2 rules, and adds the tags on that path to
+   `tags'.  `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
+   set to the number of tags seen on the path.
+
+   Assertions met on the path are OR-ed into `*assertions' and the values
+   of a parameter literal are copied into `params' (both may be NULL).
+   `*params_seen', when not NULL, is set to 1 if a parameter literal was
+   met and to 0 otherwise.  Returns REG_OK or the stack error status. */
+static reg_errcode_t
+tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
+                int *assertions, int *params, int *num_tags_seen,
+                int *params_seen)
+{
+  tre_literal_t *lit;
+  tre_union_t *uni;
+  tre_catenation_t *cat;
+  tre_iteration_t *iter;
+  int i;
+  size_t bottom = tre_stack_num_items(stack);
+  reg_errcode_t status = REG_OK;
+  if (num_tags_seen)
+    *num_tags_seen = 0;
+  if (params_seen)
+    *params_seen = 0;
+
+  status = tre_stack_push_voidptr(stack, node);
+
+  /* Walk through the tree recursively. */
+  while (status == REG_OK && tre_stack_num_items(stack) > bottom)
+    {
+      node = tre_stack_pop_voidptr(stack);
+
+      switch (node->type)
+        {
+        case LITERAL:
+          lit = (tre_literal_t *)node->obj;
+          switch (lit->code_min)
+            {
+            case TAG:
+              if (lit->code_max >= 0)
+                {
+                  if (tags != NULL)
+                    {
+                      /* Add the tag to `tags'. */
+                      for (i = 0; tags[i] >= 0; i++)
+                        if (tags[i] == lit->code_max)
+                          break;
+                      if (tags[i] < 0)
+                        {
+                          tags[i] = lit->code_max;
+                          tags[i + 1] = -1;
+                        }
+                    }
+                  if (num_tags_seen)
+                    (*num_tags_seen)++;
+                }
+              break;
+            case ASSERTION:
+              assert(lit->code_max >= 1 && lit->code_max <= ASSERT_LAST);
+              if (assertions != NULL)
+                *assertions |= lit->code_max;
+              break;
+            case PARAMETER:
+              if (params != NULL)
+                for (i = 0; i < TRE_PARAM_LAST; i++)
+                  params[i] = lit->u.params[i];
+              if (params_seen != NULL)
+                *params_seen = 1;
+              break;
+            case EMPTY:
+              break;
+            default:
+              assert(0);
+              break;
+            }
+          break;
+
+        case UNION:
+          /* Subexpressions starting earlier take priority over ones
+             starting later, so we prefer the left subexpression over the
+             right subexpression. */
+          uni = (tre_union_t *)node->obj;
+          if (uni->left->nullable)
+            STACK_PUSHX(stack, voidptr, uni->left)
+          else if (uni->right->nullable)
+            STACK_PUSHX(stack, voidptr, uni->right)
+          else
+            assert(0);
+          break;
+
+        case CATENATION:
+          /* The path must go through both children. */
+          cat = (tre_catenation_t *)node->obj;
+          assert(cat->left->nullable);
+          assert(cat->right->nullable);
+          STACK_PUSHX(stack, voidptr, cat->left);
+          STACK_PUSHX(stack, voidptr, cat->right);
+          break;
+
+        case ITERATION:
+          /* A match with an empty string is preferred over no match at
+             all, so we go through the argument if possible. */
+          iter = (tre_iteration_t *)node->obj;
+          if (iter->arg->nullable)
+            STACK_PUSHX(stack, voidptr, iter->arg);
+          break;
+
+        default:
+          assert(0);
+          break;
+        }
+    }
+
+  return status;
+}
+
+
+typedef enum {
+  NPFL_RECURSE,
+  NPFL_POST_UNION,
+  NPFL_POST_CATENATION,
+  NPFL_POST_ITERATION
+} tre_npfl_stack_symbol_t;
+
+
+/* Computes and fills in the fields `nullable', `position`, `firstpos',
+   and `lastpos' for the nodes of the AST `tree'; `nextpos' points to an
+   integer indicating the next available position, and will be updated on
+   return to reflect the number of additional positions assigned.
+
+   Returns REG_OK, REG_ESPACE when a set or scratch array cannot be
+   allocated, or the error reported by tre_match_empty(). */
+static reg_errcode_t
+tre_compute_npfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
+                 int *nextpos)
+{
+  size_t bottom = tre_stack_num_items(stack);
+
+  STACK_PUSHR(stack, voidptr, tree);
+  STACK_PUSHR(stack, int, NPFL_RECURSE);
+
+  while (tre_stack_num_items(stack) > bottom)
+    {
+      tre_npfl_stack_symbol_t symbol;
+      tre_ast_node_t *node;
+
+      symbol = (tre_npfl_stack_symbol_t)tre_stack_pop_int(stack);
+      node = tre_stack_pop_voidptr(stack);
+      switch (symbol)
+        {
+        case NPFL_RECURSE:
+          switch (node->type)
+            {
+            case LITERAL:
+              {
+                tre_literal_t *lit = (tre_literal_t *)node->obj;
+                if (IS_BACKREF(lit))
+                  {
+                    /* Back references: nullable = false, firstpos = {i},
+                       lastpos = {i}. */
+                    node->nullable = 0;
+                    lit->position = (*nextpos)++;
+                    node->firstpos = tre_set_one(mem, lit->position, 0,
+                                                 TRE_CHAR_MAX, 0, NULL, -1);
+                    if (!node->firstpos)
+                      return REG_ESPACE;
+                    node->lastpos = tre_set_one(mem, lit->position, 0,
+                                                TRE_CHAR_MAX, 0, NULL,
+                                                lit->code_max);
+                    if (!node->lastpos)
+                      return REG_ESPACE;
+                  }
+                else if (lit->code_min < 0)
+                  {
+                    /* Tags, empty strings, params, and zero width assertions:
+                       nullable = true, firstpos = {}, and lastpos = {}. */
+                    node->nullable = 1;
+                    node->firstpos = tre_set_empty(mem);
+                    if (!node->firstpos)
+                      return REG_ESPACE;
+                    node->lastpos = tre_set_empty(mem);
+                    if (!node->lastpos)
+                      return REG_ESPACE;
+                  }
+                else
+                  {
+                    /* Literal at position i: nullable = false, firstpos = {i},
+                       lastpos = {i}. */
+                    node->nullable = 0;
+                    lit->position = (*nextpos)++;
+                    node->firstpos =
+                      tre_set_one(mem, lit->position, lit->code_min,
+                                  lit->code_max, 0, NULL, -1);
+                    if (!node->firstpos)
+                      return REG_ESPACE;
+                    node->lastpos = tre_set_one(mem, lit->position,
+                                                lit->code_min,
+                                                lit->code_max,
+                                                lit->u.class, lit->neg_classes,
+                                                -1);
+                    if (!node->lastpos)
+                      return REG_ESPACE;
+                  }
+                break;
+              }
+
+            case UNION:
+              /* Compute the attributes for the two subtrees, and after that
+                 for this node. */
+              STACK_PUSHR(stack, voidptr, node);
+              STACK_PUSHR(stack, int, NPFL_POST_UNION);
+              STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
+              STACK_PUSHR(stack, int, NPFL_RECURSE);
+              STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
+              STACK_PUSHR(stack, int, NPFL_RECURSE);
+              break;
+
+            case CATENATION:
+              /* Compute the attributes for the two subtrees, and after that
+                 for this node. */
+              STACK_PUSHR(stack, voidptr, node);
+              STACK_PUSHR(stack, int, NPFL_POST_CATENATION);
+              STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
+              STACK_PUSHR(stack, int, NPFL_RECURSE);
+              STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
+              STACK_PUSHR(stack, int, NPFL_RECURSE);
+              break;
+
+            case ITERATION:
+              /* Compute the attributes for the subtree, and after that for
+                 this node. */
+              STACK_PUSHR(stack, voidptr, node);
+              STACK_PUSHR(stack, int, NPFL_POST_ITERATION);
+              STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
+              STACK_PUSHR(stack, int, NPFL_RECURSE);
+              break;
+            }
+          break; /* end case: NPFL_RECURSE */
+
+        case NPFL_POST_UNION:
+          {
+            tre_union_t *uni = (tre_union_t *)node->obj;
+            node->nullable = uni->left->nullable || uni->right->nullable;
+            node->firstpos = tre_set_union(mem, uni->left->firstpos,
+                                           uni->right->firstpos, NULL, 0, NULL);
+            if (!node->firstpos)
+              return REG_ESPACE;
+            node->lastpos = tre_set_union(mem, uni->left->lastpos,
+                                          uni->right->lastpos, NULL, 0, NULL);
+            if (!node->lastpos)
+              return REG_ESPACE;
+            break;
+          }
+
+        case NPFL_POST_ITERATION:
+          {
+            tre_iteration_t *iter = (tre_iteration_t *)node->obj;
+
+            if (iter->min == 0 || iter->arg->nullable)
+              node->nullable = 1;
+            else
+              node->nullable = 0;
+            /* The sets of the argument are shared, not copied. */
+            node->firstpos = iter->arg->firstpos;
+            node->lastpos = iter->arg->lastpos;
+            break;
+          }
+
+        case NPFL_POST_CATENATION:
+          {
+            int num_tags, *tags, assertions, params_seen;
+            int *params;
+            reg_errcode_t status;
+            tre_catenation_t *cat = node->obj;
+            node->nullable = cat->left->nullable && cat->right->nullable;
+
+            /* Compute firstpos. */
+            if (cat->left->nullable)
+              {
+                /* The left side matches the empty string.  Make a first pass
+                   with tre_match_empty() to get the number of tags and
+                   parameters. */
+                status = tre_match_empty(stack, cat->left,
+                                         NULL, NULL, NULL, &num_tags,
+                                         &params_seen);
+                if (status != REG_OK)
+                  return status;
+                /* Allocate arrays for the tags and parameters. */
+                tags = xmalloc(sizeof(*tags) * (num_tags + 1));
+                if (!tags)
+                  return REG_ESPACE;
+                tags[0] = -1;
+                assertions = 0;
+                params = NULL;
+                if (params_seen)
+                  {
+                    params = tre_mem_alloc(mem, sizeof(*params)
+                                           * TRE_PARAM_LAST);
+                    if (!params)
+                      {
+                        xfree(tags);
+                        return REG_ESPACE;
+                      }
+                  }
+                /* Second pass with tre_match_empty() to get the list of
+                   tags and parameters. */
+                status = tre_match_empty(stack, cat->left, tags,
+                                         &assertions, params, NULL, NULL);
+                if (status != REG_OK)
+                  {
+                    xfree(tags);
+                    return status;
+                  }
+                node->firstpos =
+                  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
+                                tags, assertions, params);
+                /* `tags' is scratch space; tre_set_union() copied it. */
+                xfree(tags);
+                if (!node->firstpos)
+                  return REG_ESPACE;
+              }
+            else
+              {
+                node->firstpos = cat->left->firstpos;
+              }
+
+            /* Compute lastpos. */
+            if (cat->right->nullable)
+              {
+                /* The right side matches the empty string.  Make a first pass
+                   with tre_match_empty() to get the number of tags and
+                   parameters. */
+                status = tre_match_empty(stack, cat->right,
+                                         NULL, NULL, NULL, &num_tags,
+                                         &params_seen);
+                if (status != REG_OK)
+                  return status;
+                /* Allocate arrays for the tags and parameters. */
+                tags = xmalloc(sizeof(*tags) * (num_tags + 1));
+                if (!tags)
+                  return REG_ESPACE;
+                tags[0] = -1;
+                assertions = 0;
+                params = NULL;
+                if (params_seen)
+                  {
+                    params = tre_mem_alloc(mem, sizeof(*params)
+                                           * TRE_PARAM_LAST);
+                    if (!params)
+                      {
+                        xfree(tags);
+                        return REG_ESPACE;
+                      }
+                  }
+                /* Second pass with tre_match_empty() to get the list of
+                   tags and parameters. */
+                status = tre_match_empty(stack, cat->right, tags,
+                                         &assertions, params, NULL, NULL);
+                if (status != REG_OK)
+                  {
+                    xfree(tags);
+                    return status;
+                  }
+                node->lastpos =
+                  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
+                                tags, assertions, params);
+                xfree(tags);
+                if (!node->lastpos)
+                  return REG_ESPACE;
+              }
+            else
+              {
+                node->lastpos = cat->right->lastpos;
+              }
+            break;
+          }
+
+        default:
+          assert(0);
+          break;
+        }
+    }
+
+  return REG_OK;
+}
+
+
+/* Adds a transition from each position in `p1' to each position in `p2'.
*/
+/* Operates in one of two modes, selected by `transitions':
+     - `transitions' == NULL: nothing is written except `counts'; for every
+       (p1, p2) pair `counts[p1->position]' is incremented, giving an upper
+       bound on the transitions leaving each position.
+     - `transitions' != NULL: for every (p1, p2) pair the next free slot
+       after `transitions + offs[p1->position]' is filled in.
+   Both `p1' and `p2' are lists terminated by an entry whose position is
+   negative.  Returns REG_OK, or REG_ESPACE if an xmalloc() fails. */
+static reg_errcode_t
+tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
+               tre_tnfa_transition_t *transitions,
+               int *counts, int *offs)
+{
+  tre_pos_and_tags_t *orig_p2 = p2;
+  tre_tnfa_transition_t *trans;
+  int i, j, k, l, dup, prev_p2_pos;
+
+  if (transitions != NULL)
+    while (p1->position >= 0)
+      {
+        p2 = orig_p2;
+        prev_p2_pos = -1;
+        while (p2->position >= 0)
+          {
+            /* Optimization: if this position was already handled, skip it. */
+            if (p2->position == prev_p2_pos)
+              {
+                p2++;
+                continue;
+              }
+            prev_p2_pos = p2->position;
+            /* Set `trans' to point to the next unused transition from
+               position `p1->position'. */
+            trans = transitions + offs[p1->position];
+            while (trans->state != NULL)
+              {
+#if 0
+                /* If we find a previous transition from `p1->position' to
+                   `p2->position', it is overwritten.  This can happen only
+                   if there are nested loops in the regexp, like in "((a)*)*".
+                   In POSIX.2 repetition using the outer loop is always
+                   preferred over using the inner loop.  Therefore the
+                   transition for the inner loop is useless and can be thrown
+                   away. */
+                /* XXX - The same position is used for all nodes in a bracket
+                   expression, so this optimization cannot be used (it will
+                   break bracket expressions) unless I figure out a way to
+                   detect it here. */
+                if (trans->state_id == p2->position)
+                  {
+                    DPRINT(("*"));
+                    break;
+                  }
+#endif
+                trans++;
+              }
+
+            /* A free slot was found: mark the slot after it as the new end
+               of this position's transition list. */
+            if (trans->state == NULL)
+              (trans + 1)->state = NULL;
+            /* Use the character ranges, assertions, etc. from `p1' for
+               the transition from `p1' to `p2'. */
+            trans->code_min = (tre_cint_t) p1->code_min;
+            trans->code_max = (tre_cint_t) p1->code_max;
+            trans->state = transitions + offs[p2->position];
+            trans->state_id = p2->position;
+            trans->assertions = p1->assertions | p2->assertions
+              | (p1->class ? ASSERT_CHAR_CLASS : 0)
+              | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
+            /* `u' holds either the back reference number or the character
+               class of `p1', flagged through ASSERT_BACKREF. */
+            if (p1->backref >= 0)
+              {
+                assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
+                assert(p2->backref < 0);
+                trans->u.backref = p1->backref;
+                trans->assertions |= ASSERT_BACKREF;
+              }
+            else
+              trans->u.class = p1->class;
+            if (p1->neg_classes != NULL)
+              {
+                /* Copy the zero-terminated list into an xmalloc()ed array
+                   stored in the transition. */
+                for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
+                trans->neg_classes =
+                  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
+                if (trans->neg_classes == NULL)
+                  return REG_ESPACE;
+                for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
+                  trans->neg_classes[i] = p1->neg_classes[i];
+                trans->neg_classes[i] = (tre_ctype_t)0;
+              }
+            else
+              trans->neg_classes = NULL;
+
+            /* Find out how many tags this transition has. */
+            i = 0;
+            if (p1->tags != NULL)
+              while(p1->tags[i] >= 0)
+                i++;
+            j = 0;
+            if (p2->tags != NULL)
+              while(p2->tags[j] >= 0)
+                j++;
+
+            /* If we are overwriting a transition, free the old tag array. */
+            if (trans->tags != NULL)
+              xfree(trans->tags);
+            trans->tags = NULL;
+
+            /* If there were any tags, allocate an array and fill it. */
+            if (i + j > 0)
+              {
+                trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
+                if (!trans->tags)
+                  return REG_ESPACE;
+                /* First all tags of `p1' ... */
+                i = 0;
+                if (p1->tags != NULL)
+                  while(p1->tags[i] >= 0)
+                    {
+                      trans->tags[i] = p1->tags[i];
+                      i++;
+                    }
+                /* ... then the tags of `p2' that do not already occur among
+                   the `i' tags taken from `p1'. */
+                l = i;
+                j = 0;
+                if (p2->tags != NULL)
+                  while (p2->tags[j] >= 0)
+                    {
+                      /* Don't add duplicates. */
+                      dup = 0;
+                      for (k = 0; k < i; k++)
+                        if (trans->tags[k] == p2->tags[j])
+                          {
+                            dup = 1;
+                            break;
+                          }
+                      if (!dup)
+                        trans->tags[l++] = p2->tags[j];
+                      j++;
+                    }
+                trans->tags[l] = -1;
+              }
+
+            /* Set the parameter array.  If both `p2' and `p1' have same
+               parameters, the values in `p2' override those in `p1'. */
+            if (p1->params || p2->params)
+              {
+                /* An array already attached to the slot is reused. */
+                if (!trans->params)
+                  trans->params = xmalloc(sizeof(*trans->params)
+                                          * TRE_PARAM_LAST);
+                if (!trans->params)
+                  return REG_ESPACE;
+                for (i = 0; i < TRE_PARAM_LAST; i++)
+                  {
+                    trans->params[i] = TRE_PARAM_UNSET;
+                    if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
+                      trans->params[i] = p1->params[i];
+                    if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
+                      trans->params[i] = p2->params[i];
+                  }
+              }
+            else
+              {
+                if (trans->params)
+                  xfree(trans->params);
+                trans->params = NULL;
+              }
+
+
+#ifdef TRE_DEBUG
+            {
+              int *tags;
+
+              DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
+                      p1->code_min));
+              if (p1->code_max != p1->code_min)
+                DPRINT(("-%3d", p1->code_max));
+              tags = trans->tags;
+              if (tags)
+                {
+                  DPRINT((", tags ["));
+                  while (*tags >= 0)
+                    {
+                      DPRINT(("%d", *tags));
+                      tags++;
+                      if (*tags >= 0)
+                        DPRINT((","));
+                    }
+                  DPRINT(("]"));
+                }
+              if (trans->assertions)
+                DPRINT((", assert %d", trans->assertions));
+              if (trans->assertions & ASSERT_BACKREF)
+                DPRINT((", backref %d", trans->u.backref));
+              else if (trans->u.class)
+                DPRINT((", class %ld", (long)trans->u.class));
+              if (trans->neg_classes)
+                DPRINT((", neg_classes %p", trans->neg_classes));
+              if (trans->params)
+                {
+                  DPRINT((", "));
+                  tre_print_params(trans->params);
+                }
+              DPRINT(("\n"));
+            }
+#endif /* TRE_DEBUG */
+            p2++;
+          }
+        p1++;
+      }
+  else
+    /* Compute a maximum limit for the number of transitions leaving
+       from each state. */
+    while (p1->position >= 0)
+      {
+        p2 = orig_p2;
+        while (p2->position >= 0)
+          {
+            counts[p1->position]++;
+            p2++;
+          }
+        p1++;
+      }
+  return REG_OK;
+}
+
+/* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
+   labelled with one character range (there are no transitions on empty
+   strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
+   the regexp.
*/ +static reg_errcode_t +tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, + int *counts, int *offs) +{ + tre_union_t *uni; + tre_catenation_t *cat; + tre_iteration_t *iter; + reg_errcode_t errcode = REG_OK; + + /* XXX - recurse using a stack!. */ + switch (node->type) + { + case LITERAL: + break; + case UNION: + uni = (tre_union_t *)node->obj; + errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs); + break; + + case CATENATION: + cat = (tre_catenation_t *)node->obj; + /* Add a transition from each position in cat->left->lastpos + to each position in cat->right->firstpos. */ + errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos, + transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs); + break; + + case ITERATION: + iter = (tre_iteration_t *)node->obj; + assert(iter->max == -1 || iter->max == 1); + + if (iter->max == -1) + { + assert(iter->min == 0 || iter->min == 1); + /* Add a transition from each last position in the iterated + expression to each first position. 
*/ + errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos, + transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + } + errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs); + break; + } + return errcode; +} + +#define ERROR_EXIT(err) \ + do \ + { \ + errcode = err; \ + if (/*CONSTCOND*/(void)1,1) \ + goto error_exit; \ + } \ + while (/*CONSTCOND*/(void)0,0) + + +int +tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) +{ + tre_stack_t *stack; + tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; + tre_pos_and_tags_t *p; + int *counts = NULL, *offs = NULL; + int i, add = 0; + tre_tnfa_transition_t *transitions, *initial; + tre_tnfa_t *tnfa = NULL; + tre_submatch_data_t *submatch_data; + tre_tag_direction_t *tag_directions = NULL; + reg_errcode_t errcode; + tre_mem_t mem; + int numpos = 0; + + /* Parse context. */ + tre_parse_ctx_t parse_ctx; + + /* Allocate a stack used throughout the compilation process for various + purposes. */ + stack = tre_stack_new(512, TRE_MAX_STACK); + if (!stack) + return REG_ESPACE; + /* Allocate a fast memory allocator. */ + mem = tre_mem_new(); + if (!mem) + { + tre_stack_destroy(stack); + return REG_ESPACE; + } + + /* Parse the regexp. */ + memset(&parse_ctx, 0, sizeof(parse_ctx)); + parse_ctx.mem = mem; + parse_ctx.stack = stack; + parse_ctx.re = regex; + parse_ctx.len = n; + parse_ctx.cflags = cflags; + parse_ctx.max_backref = -1; + /* Use 8-bit optimizations in 8-bit mode */ + parse_ctx.mb_cur_max = (cflags & REG_USEBYTES) ? 1 : TRE_MB_CUR_MAX; + DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex)); + errcode = tre_parse(&parse_ctx); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + preg->re_nsub = parse_ctx.submatch_id - 1; + tree = parse_ctx.result; + + /* Back references and approximate matching cannot currently be used + in the same regexp. 
*/ + if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx) + ERROR_EXIT(REG_BADPAT); + +#ifdef TRE_DEBUG + tre_ast_print(tree); +#endif /* TRE_DEBUG */ + + /* Referring to nonexistent subexpressions is illegal. */ + if (parse_ctx.max_backref > (int)preg->re_nsub) + ERROR_EXIT(REG_ESUBREG); + + /* Allocate the TNFA struct. */ + tnfa = xcalloc(1, sizeof(tre_tnfa_t)); + if (tnfa == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->have_backrefs = parse_ctx.max_backref >= 0; + tnfa->have_approx = parse_ctx.have_approx; + tnfa->num_submatches = parse_ctx.submatch_id; + + /* The literal optimizer only looks at the final tree plus the outer + * compile flags. If the regexp changes flags inline with (?i:...) or + * (?-i:...), those scopes are no longer explicit in the optimized form, + * so keep using the full matcher. */ + if (!parse_ctx.have_inline_cflags) + { + errcode = tre_litopt_try_compile(tnfa, tree, cflags, + parse_ctx.mb_cur_max); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + } + + /* Set up tags for submatch addressing. If REG_NOSUB is set and the + regexp does not have back references, this can be skipped. */ + if (tnfa->have_backrefs || !(cflags & REG_NOSUB)) + { + DPRINT(("tre_compile: setting up tags\n")); + + /* Figure out how many tags we will need. 
*/ + errcode = tre_add_tags(NULL, stack, tree, tnfa); + if (errcode != REG_OK) + ERROR_EXIT(errcode); +#ifdef TRE_DEBUG + tre_ast_print(tree); +#endif /* TRE_DEBUG */ + + if (tnfa->num_tags > 0) + { + tag_directions = xmalloc(sizeof(*tag_directions) + * (tnfa->num_tags + 1)); + if (tag_directions == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->tag_directions = tag_directions; + memset(tag_directions, -1, + sizeof(*tag_directions) * (tnfa->num_tags + 1)); + } + tnfa->minimal_tags = xcalloc(tnfa->num_tags * 2 + 1, + sizeof(*tnfa->minimal_tags)); + if (tnfa->minimal_tags == NULL) + ERROR_EXIT(REG_ESPACE); + + submatch_data = xcalloc((unsigned)parse_ctx.submatch_id, + sizeof(*submatch_data)); + if (submatch_data == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->submatch_data = submatch_data; + + errcode = tre_add_tags(mem, stack, tree, tnfa); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + +#ifdef TRE_DEBUG + for (i = 0; i < parse_ctx.submatch_id; i++) + DPRINT(("pmatch[%d] = {t%d, t%d}\n", + i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); + for (i = 0; i < tnfa->num_tags; i++) + DPRINT(("t%d is %s\n", i, + tag_directions[i] == TRE_TAG_MINIMIZE ? + "minimized" : "maximized")); +#endif /* TRE_DEBUG */ + } + + /* Expand iteration nodes. */ + errcode = tre_expand_ast(mem, stack, tree, tag_directions, + &tnfa->params_depth); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + + /* Add a dummy node for the final state. + XXX - For certain patterns this dummy node can be optimized away, + for example "a*" or "ab*". Figure out a simple way to detect + this possibility. 
*/ + tmp_ast_l = tree; + tmp_ast_r = tre_ast_new_literal(mem, 0, 0); + if (tmp_ast_r == NULL) + ERROR_EXIT(REG_ESPACE); + + tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); + if (tree == NULL) + ERROR_EXIT(REG_ESPACE); + + errcode = tre_compute_npfl(mem, stack, tree, &numpos); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + +#ifdef TRE_DEBUG + tre_ast_print(tree); + DPRINT(("Number of states: %d\n", numpos)); +#endif /* TRE_DEBUG */ + + counts = xmalloc(sizeof(int) * numpos); + if (counts == NULL) + ERROR_EXIT(REG_ESPACE); + + offs = xmalloc(sizeof(int) * numpos); + if (offs == NULL) + ERROR_EXIT(REG_ESPACE); + + for (i = 0; i < numpos; i++) + counts[i] = 0; + tre_ast_to_tnfa(tree, NULL, counts, NULL); + + add = 0; + for (i = 0; i < numpos; i++) + { + offs[i] = add; + add += counts[i] + 1; + counts[i] = 0; + } + transitions = xcalloc((unsigned)add + 1, sizeof(*transitions)); + if (transitions == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->transitions = transitions; + tnfa->num_transitions = add; + + DPRINT(("Converting to TNFA:\n")); + errcode = tre_ast_to_tnfa(tree, transitions, counts, offs); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + + /* If in eight bit mode, compute a table of characters that can be the + first character of a match. 
*/ + tnfa->first_char = -1; + if (parse_ctx.mb_cur_max == 1 && !tmp_ast_l->nullable) + { + int count = 0; + tre_cint_t k; + DPRINT(("Characters that can start a match:")); + tnfa->firstpos_chars = xcalloc(256, sizeof(char)); + if (tnfa->firstpos_chars == NULL) + ERROR_EXIT(REG_ESPACE); + for (p = tree->firstpos; p->position >= 0; p++) + { + tre_tnfa_transition_t *j = transitions + offs[p->position]; + while (j->state != NULL) + { + for (k = j->code_min; k <= j->code_max && k < 256; k++) + { + DPRINT((" %d", k)); + tnfa->firstpos_chars[k] = 1; + count++; + } + j++; + } + } + DPRINT(("\n")); +#define TRE_OPTIMIZE_FIRST_CHAR 1 +#if TRE_OPTIMIZE_FIRST_CHAR + if (count == 1) + { + for (k = 0; k < 256; k++) + if (tnfa->firstpos_chars[k]) + { + DPRINT(("first char must be %d\n", k)); + tnfa->first_char = k; + xfree(tnfa->firstpos_chars); + tnfa->firstpos_chars = NULL; + break; + } + } +#endif + + } + else + tnfa->firstpos_chars = NULL; + + + p = tree->firstpos; + i = 0; + while (p->position >= 0) + { + i++; + +#ifdef TRE_DEBUG + { + int *tags; + DPRINT(("initial: %d", p->position)); + tags = p->tags; + if (tags != NULL) + { + if (*tags >= 0) + DPRINT(("/")); + while (*tags >= 0) + { + DPRINT(("%d", *tags)); + tags++; + if (*tags >= 0) + DPRINT((",")); + } + } + DPRINT((", assert %d", p->assertions)); + if (p->params) + { + DPRINT((", ")); + tre_print_params(p->params); + } + DPRINT(("\n")); + } +#endif /* TRE_DEBUG */ + + p++; + } + + initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t)); + if (initial == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->initial = initial; + + i = 0; + for (p = tree->firstpos; p->position >= 0; p++) + { + initial[i].state = transitions + offs[p->position]; + initial[i].state_id = p->position; + initial[i].tags = NULL; + /* Copy the arrays p->tags, and p->params, they are allocated + from a tre_mem object. 
*/ + if (p->tags) + { + int j; + for (j = 0; p->tags[j] >= 0; j++); + initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1)); + if (!initial[i].tags) + ERROR_EXIT(REG_ESPACE); + memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1)); + } + initial[i].params = NULL; + if (p->params) + { + initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST); + if (!initial[i].params) + ERROR_EXIT(REG_ESPACE); + memcpy(initial[i].params, p->params, + sizeof(*p->params) * TRE_PARAM_LAST); + } + initial[i].assertions = p->assertions; + i++; + } + initial[i].state = NULL; + + tnfa->num_transitions = add; + tnfa->final = transitions + offs[tree->lastpos[0].position]; + tnfa->num_states = numpos; + tnfa->cflags = cflags; + + DPRINT(("final state %p\n", (void *)tnfa->final)); + + tre_mem_destroy(mem); + tre_stack_destroy(stack); + xfree(counts); + xfree(offs); + + preg->TRE_REGEX_T_FIELD = (void *)tnfa; + return REG_OK; + + error_exit: + /* Free everything that was allocated and return the error code. 
*/ + tre_mem_destroy(mem); + if (stack != NULL) + tre_stack_destroy(stack); + if (counts != NULL) + xfree(counts); + if (offs != NULL) + xfree(offs); + preg->TRE_REGEX_T_FIELD = (void *)tnfa; + tre_free(preg); + return errcode; +} + + + + +void +tre_free(regex_t *preg) +{ + tre_tnfa_t *tnfa; + unsigned int i; + tre_tnfa_transition_t *trans; + + tnfa = (void *)preg->TRE_REGEX_T_FIELD; + if (!tnfa) + return; + + for (i = 0; i < tnfa->num_transitions; i++) + if (tnfa->transitions[i].state) + { + if (tnfa->transitions[i].tags) + xfree(tnfa->transitions[i].tags); + if (tnfa->transitions[i].neg_classes) + xfree(tnfa->transitions[i].neg_classes); + if (tnfa->transitions[i].params) + xfree(tnfa->transitions[i].params); + } + if (tnfa->transitions) + xfree(tnfa->transitions); + + if (tnfa->initial) + { + for (trans = tnfa->initial; trans->state; trans++) + { + if (trans->tags) + xfree(trans->tags); + if (trans->params) + xfree(trans->params); + } + xfree(tnfa->initial); + } + + if (tnfa->submatch_data) + { + for (i = 0; i < tnfa->num_submatches; i++) + if (tnfa->submatch_data[i].parents) + xfree(tnfa->submatch_data[i].parents); + xfree(tnfa->submatch_data); + } + + if (tnfa->tag_directions) + xfree(tnfa->tag_directions); + if (tnfa->firstpos_chars) + xfree(tnfa->firstpos_chars); + if (tnfa->minimal_tags) + xfree(tnfa->minimal_tags); + tre_litopt_free_literal_list(tnfa->literal_opt.literals, + tnfa->literal_opt.num_literals); + xfree(tnfa); +} + +char * +tre_version(void) +{ + static char str[256]; + char *version; + + if (str[0] == 0) + { + (void) tre_config(TRE_CONFIG_VERSION, &version); + (void) snprintf(str, sizeof(str), "TRE %s (BSD)", version); + } + return str; +} + +int +tre_config(int query, void *result) +{ + int *int_result = result; + const char **string_result = result; + + switch (query) + { + case TRE_CONFIG_APPROX: +#ifdef TRE_APPROX + *int_result = 1; +#else /* !TRE_APPROX */ + *int_result = 0; +#endif /* !TRE_APPROX */ + return REG_OK; + + case 
TRE_CONFIG_WCHAR: +#ifdef TRE_WCHAR + *int_result = 1; +#else /* !TRE_WCHAR */ + *int_result = 0; +#endif /* !TRE_WCHAR */ + return REG_OK; + + case TRE_CONFIG_MULTIBYTE: +#ifdef TRE_MULTIBYTE + *int_result = 1; +#else /* !TRE_MULTIBYTE */ + *int_result = 0; +#endif /* !TRE_MULTIBYTE */ + return REG_OK; + + case TRE_CONFIG_SYSTEM_ABI: +#ifdef TRE_CONFIG_SYSTEM_ABI + *int_result = 1; +#else /* !TRE_CONFIG_SYSTEM_ABI */ + *int_result = 0; +#endif /* !TRE_CONFIG_SYSTEM_ABI */ + return REG_OK; + + case TRE_CONFIG_VERSION: + *string_result = TRE_VERSION; + return REG_OK; + } + + return REG_NOMATCH; +} + + +/* EOF */ diff --git a/deps/tre/lib/tre-compile.h b/deps/tre/lib/tre-compile.h new file mode 100644 index 000000000..51d5ac94a --- /dev/null +++ b/deps/tre/lib/tre-compile.h @@ -0,0 +1,27 @@ +/* + tre-compile.h: Regex compilation definitions + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + + +#ifndef TRE_COMPILE_H +#define TRE_COMPILE_H 1 + +typedef struct { + int position; + int code_min; + int code_max; + int *tags; + int assertions; + tre_ctype_t class; + tre_ctype_t *neg_classes; + int backref; + int *params; +} tre_pos_and_tags_t; + +#endif /* TRE_COMPILE_H */ + +/* EOF */ diff --git a/deps/tre/lib/tre-filter.c b/deps/tre/lib/tre-filter.c new file mode 100644 index 000000000..194e188ba --- /dev/null +++ b/deps/tre/lib/tre-filter.c @@ -0,0 +1,73 @@ +/* + tre-filter.c: Histogram filter to quickly find regexp match candidates + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* The idea of this filter is quite simple. First, let's assume the + search pattern is a simple string. In order for a substring of a + longer string to match the search pattern, it must have the same + numbers of different characters as the pattern, and those + characters must occur in the same order as they occur in pattern. 
*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include +#include "tre-internal.h" +#include "tre-filter.h" + +int +tre_filter_find(const unsigned char *str, size_t len, tre_filter_t *filter) +{ + unsigned short counts[256]; + unsigned int i; + unsigned int window_len = filter->window_len; + tre_filter_profile_t *profile = filter->profile; + const unsigned char *str_orig = str; + + DPRINT(("tre_filter_find: %.*s\n", len, str)); + + for (i = 0; i < elementsof(counts); i++) + counts[i] = 0; + + i = 0; + while (*str && i < window_len && i < len) + { + counts[*str]++; + i++; + str++; + len--; + } + + while (len > 0) + { + tre_filter_profile_t *p; + counts[*str]++; + counts[*(str - window_len)]--; + + p = profile; + while (p->ch) + { + if (counts[p->ch] < p->count) + break; + p++; + } + if (!p->ch) + { + DPRINT(("Found possible match at %d\n", + str - str_orig)); + return str - str_orig; + } + else + { + DPRINT(("No match so far...\n")); + } + len--; + str++; + } + DPRINT(("This string cannot match.\n")); + return -1; +} diff --git a/deps/tre/lib/tre-filter.h b/deps/tre/lib/tre-filter.h new file mode 100644 index 000000000..31d0b8263 --- /dev/null +++ b/deps/tre/lib/tre-filter.h @@ -0,0 +1,19 @@ + + + + +typedef struct { + unsigned char ch; + unsigned char count; +} tre_filter_profile_t; + +typedef struct { + /* Length of the window where the character counts are kept. */ + int window_len; + /* Required character counts table. */ + tre_filter_profile_t *profile; +} tre_filter_t; + + +int +tre_filter_find(const unsigned char *str, size_t len, tre_filter_t *filter); diff --git a/deps/tre/lib/tre-internal.h b/deps/tre/lib/tre-internal.h new file mode 100644 index 000000000..40081f0c0 --- /dev/null +++ b/deps/tre/lib/tre-internal.h @@ -0,0 +1,319 @@ +/* + tre-internal.h - TRE internal definitions + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. 
+
+*/
+
+#ifndef TRE_INTERNAL_H
+#define TRE_INTERNAL_H 1
+/* NOTE(review): the bare `#include' lines below carry no header name in this copy of the patch; restore the operands from the original commit. */
+#ifdef HAVE_WCHAR_H
+#include
+#endif /* HAVE_WCHAR_H */
+
+#ifdef HAVE_WCTYPE_H
+#include
+#endif /* HAVE_WCTYPE_H */
+
+#ifdef HAVE_SYS_TYPES_H
+#include
+#endif /* HAVE_SYS_TYPES_H */
+
+#include
+#include
+#include "../local_includes/tre.h"
+
+#define TRE_MAX_RE 65536
+#define TRE_MAX_STRING INT_MAX /* tre_tnfa_run_backtrack() clamps its input length to this. */
+#define TRE_MAX_STACK 1048576
+/* DPRINT((fmt, ...)) takes one parenthesized printf argument list: it prints and flushes stdout under TRE_DEBUG and is an empty statement otherwise. */
+#ifdef TRE_DEBUG
+#include
+#define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/(void)0,0)
+#else /* !TRE_DEBUG */
+#define DPRINT(msg) do { } while(/*CONSTCOND*/(void)0,0)
+#endif /* !TRE_DEBUG */
+/* Element count of a true array; applied to a pointer it would divide the pointer size instead. */
+#define elementsof(x) ( sizeof(x) / sizeof(x[0]) )
+/* tre_mbrtowc(): mbrtowc() when available, otherwise mbtowc() with the state argument dropped; left undefined when neither HAVE_ macro is set. */
+#ifdef HAVE_MBRTOWC
+#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
+#else /* !HAVE_MBRTOWC */
+#ifdef HAVE_MBTOWC
+#define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n)))
+#endif /* HAVE_MBTOWC */
+#endif /* !HAVE_MBRTOWC */
+/* TRE_MBSTATE is defined only when both TRE_MULTIBYTE and HAVE_MBSTATE_T are defined. */
+#ifdef TRE_MULTIBYTE
+#ifdef HAVE_MBSTATE_T
+#define TRE_MBSTATE
+#endif /* HAVE_MBSTATE_T */
+#endif /* TRE_MULTIBYTE */
+
+/* Define the character types and functions. */
+#ifdef TRE_WCHAR
+
+/* Wide characters. */
+typedef wint_t tre_cint_t;
+#if WCHAR_MAX <= INT_MAX
+#define TRE_CHAR_MAX WCHAR_MAX
+#else /* WCHAR_MAX > INT_MAX */
+#define TRE_CHAR_MAX INT_MAX
+#endif
+
+#ifdef TRE_MULTIBYTE
+#define TRE_MB_CUR_MAX MB_CUR_MAX
+#else /* !TRE_MULTIBYTE */
+#define TRE_MB_CUR_MAX 1
+#endif /* !TRE_MULTIBYTE */
+/* Wide build: the tre_is*() and tre_to*() names map straight onto the <wctype.h>-style isw*()/tow*() functions. */
+#define tre_isalnum iswalnum
+#define tre_isalpha iswalpha
+#ifdef HAVE_ISWBLANK
+#define tre_isblank iswblank
+#endif /* HAVE_ISWBLANK */
+#define tre_iscntrl iswcntrl
+#define tre_isdigit iswdigit
+#define tre_isgraph iswgraph
+#define tre_islower iswlower
+#define tre_isprint iswprint
+#define tre_ispunct iswpunct
+#define tre_isspace iswspace
+#define tre_isupper iswupper
+#define tre_isxdigit iswxdigit
+
+#define tre_tolower towlower
+#define tre_toupper towupper
+#define tre_strlen wcslen
+
+#else /* !TRE_WCHAR */
+
+/* 8 bit characters.
*/
+typedef short tre_cint_t;
+#define TRE_CHAR_MAX 255
+#define TRE_MB_CUR_MAX 1
+/* 8-bit build: the tre_is*() names map straight onto the is*() character classification functions. */
+#define tre_isalnum isalnum
+#define tre_isalpha isalpha
+#ifdef HAVE_ISASCII
+#define tre_isascii isascii
+#endif /* HAVE_ISASCII */
+#ifdef HAVE_ISBLANK
+#define tre_isblank isblank
+#endif /* HAVE_ISBLANK */
+#define tre_iscntrl iscntrl
+#define tre_isdigit isdigit
+#define tre_isgraph isgraph
+#define tre_islower islower
+#define tre_isprint isprint
+#define tre_ispunct ispunct
+#define tre_isspace isspace
+#define tre_isupper isupper
+#define tre_isxdigit isxdigit
+/* The case-mapping results are cast back to tre_cint_t; tre_strlen() casts its argument to const char *. */
+#define tre_tolower(c) (tre_cint_t)(tolower(c))
+#define tre_toupper(c) (tre_cint_t)(toupper(c))
+#define tre_strlen(s) (strlen((const char*)s))
+
+#endif /* !TRE_WCHAR */
+/* The system wctype machinery is used only in wide builds that have both iswctype() and wctype(). */
+#if defined(TRE_WCHAR) && defined(HAVE_ISWCTYPE) && defined(HAVE_WCTYPE)
+#define TRE_USE_SYSTEM_WCTYPE 1
+#endif
+
+#ifdef TRE_USE_SYSTEM_WCTYPE
+/* Use system provided iswctype() and wctype(). */
+typedef wctype_t tre_ctype_t;
+#define tre_isctype iswctype
+#define tre_ctype wctype
+#else /* !TRE_USE_SYSTEM_WCTYPE */
+/* Define our own versions of iswctype() and wctype(): a class is a predicate function pointer and tre_isctype() simply calls it. */
+typedef int (*tre_ctype_t)(tre_cint_t);
+#define tre_isctype(c, type) ( (type)(c) )
+tre_ctype_t tre_ctype(const char *name);
+#endif /* !TRE_USE_SYSTEM_WCTYPE */
+/* Kind of input handed to the matchers; the backtracking matcher treats STR_USER and STR_WIDE specially. */
+typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
+
+/* Returns number of bytes to add to (char *)ptr to make it
+   properly aligned for the type.  NOTE(review): the address is converted through `long', which is narrower than a pointer on LLP64 targets -- confirm this is acceptable. */
+#define ALIGN(ptr, type) \
+  ((((long)ptr) % sizeof(type)) \
+   ? (sizeof(type) - (((long)ptr) % sizeof(type))) \
+   : 0)
+/* MAX and MIN evaluate each argument twice; do not pass expressions with side effects. */
+#undef MAX
+#undef MIN
+#define MAX(a, b) (((a) >= (b)) ? (a) : (b))
+#define MIN(a, b) (((a) <= (b)) ? (a) : (b))
+
+/* Define STRF to the correct printf formatter for strings. */
+#ifdef TRE_WCHAR
+#define STRF "ls"
+#else /* !TRE_WCHAR */
+#define STRF "s"
+#endif /* !TRE_WCHAR */
+
+/* TNFA transition type. A TNFA state is an array of transitions,
+   the terminator is a transition with NULL `state'.
*/
+typedef struct tnfa_transition tre_tnfa_transition_t;
+
+struct tnfa_transition {
+  /* Range of accepted characters. */
+  tre_cint_t code_min;
+  tre_cint_t code_max;
+  /* Pointer to the destination state. */
+  tre_tnfa_transition_t *state;
+  /* ID number of the destination state. */
+  int state_id;
+  /* -1 terminated array of tags (or NULL). */
+  int *tags;
+  /* Matching parameters settings (or NULL). */
+  int *params;
+  /* Assertion bitmap. */
+  int assertions;
+  /* Assertion parameters. */
+  union {
+    /* Character class assertion. */
+    tre_ctype_t class;
+    /* Back reference assertion. */
+    int backref;
+  } u;
+  /* Negative character class assertions. */
+  tre_ctype_t *neg_classes;
+};
+
+/* Bit flags combined in the `assertions' field of struct tnfa_transition. */
+/* Assertions. */
+#define ASSERT_AT_BOL 1 /* Beginning of line. */
+#define ASSERT_AT_EOL 2 /* End of line. */
+#define ASSERT_CHAR_CLASS 4 /* Character class in `class'. */
+#define ASSERT_CHAR_CLASS_NEG 8 /* Character classes in `neg_classes'. */
+#define ASSERT_AT_BOW 16 /* Beginning of word. */
+#define ASSERT_AT_EOW 32 /* End of word. */
+#define ASSERT_AT_WB 64 /* Word boundary. */
+#define ASSERT_AT_WB_NEG 128 /* Not a word boundary. */
+#define ASSERT_BACKREF 256 /* A back reference in `backref'. */
+#define ASSERT_LAST 256 /* Same value as the highest flag, ASSERT_BACKREF. */
+
+/* Tag directions. */
+typedef enum {
+  TRE_TAG_MINIMIZE = 0,
+  TRE_TAG_MAXIMIZE = 1
+} tre_tag_direction_t;
+
+/* Parameters that can be changed dynamically while matching. */
+typedef enum {
+  TRE_PARAM_COST_INS = 0,
+  TRE_PARAM_COST_DEL = 1,
+  TRE_PARAM_COST_SUBST = 2,
+  TRE_PARAM_COST_MAX = 3,
+  TRE_PARAM_MAX_INS = 4,
+  TRE_PARAM_MAX_DEL = 5,
+  TRE_PARAM_MAX_SUBST = 6,
+  TRE_PARAM_MAX_ERR = 7,
+  TRE_PARAM_DEPTH = 8,
+  TRE_PARAM_LAST = 9 /* Count of the parameters above; tre_compile() sizes copied `params' arrays with it. */
+} tre_param_t;
+
+/* Unset matching parameter */
+#define TRE_PARAM_UNSET -1
+
+/* Signifies the default matching parameter value. */
+#define TRE_PARAM_DEFAULT -2
+
+/* Instructions to compute submatch register values from tag values
+   after a successful match.
*/
+struct tre_submatch_data {
+  /* Tag that gives the value for rm_so (submatch start offset). */
+  int so_tag;
+  /* Tag that gives the value for rm_eo (submatch end offset). */
+  int eo_tag;
+  /* List of submatches this submatch is contained in. */
+  int *parents;
+};
+
+typedef struct tre_submatch_data tre_submatch_data_t;
+/* NOTE(review): the mode names suggest how a literal must occur in the subject; their exact meaning is defined by the literal optimizer, which is not part of this header -- confirm there. */
+typedef enum {
+  TRE_LITERAL_OPT_NONE = 0,
+  TRE_LITERAL_OPT_CONTAINS,
+  TRE_LITERAL_OPT_PREFIX,
+  TRE_LITERAL_OPT_SUFFIX,
+  TRE_LITERAL_OPT_EXACT
+} tre_literal_opt_mode_t;
+/* One literal byte string: `len' bytes at `data'. */
+typedef struct {
+  unsigned char *data;
+  size_t len;
+} tre_literal_opt_literal_t;
+/* Literal fast-path data embedded in struct tnfa; tre_free() releases `literals' through tre_litopt_free_literal_list(). */
+typedef struct {
+  tre_literal_opt_mode_t mode;
+  int nocase;
+  size_t num_literals;
+  /* Folded byte mapping used by the nocase fast path. */
+  unsigned char fold_map[256];
+  /* Literal index ranges grouped by the first literal byte. */
+  size_t start_offsets[257];
+  tre_literal_opt_literal_t *literals;
+} tre_literal_opt_t;
+
+
+/* TNFA definition. */
+typedef struct tnfa tre_tnfa_t;
+
+struct tnfa {
+  tre_tnfa_transition_t *transitions; /* Transition table; tre_free() walks num_transitions entries of it. */
+  unsigned int num_transitions;
+  tre_tnfa_transition_t *initial; /* Initial transitions, terminated by an entry whose `state' is NULL. */
+  tre_tnfa_transition_t *final; /* Final state; points into `transitions'. */
+  tre_submatch_data_t *submatch_data;
+  char *firstpos_chars; /* 256-entry table of bytes that can start a match, or NULL. */
+  int first_char; /* The only byte that can start a match, or -1. */
+  unsigned int num_submatches;
+  tre_tag_direction_t *tag_directions;
+  int *minimal_tags;
+  int num_tags;
+  int num_minimals;
+  int end_tag;
+  int num_states;
+  int cflags; /* Compilation flags as passed to tre_compile(). */
+  int have_backrefs;
+  int have_approx;
+  int params_depth;
+  tre_literal_opt_t literal_opt;
+};
+
+int
+tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags);
+/* Releases everything owned by the TNFA attached to preg; returns at once when none is attached. */
+void
+tre_free(regex_t *preg);
+
+void
+tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
+		const tre_tnfa_t *tnfa, int *tags, int match_eo);
+
+reg_errcode_t
+tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
+		      tre_str_type_t type, int *match_tags, int eflags,
+		      int *match_end_ofs);
+/* Backtracking matcher, defined in tre-match-backtrack.c. */
+reg_errcode_t
+tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
+		       tre_str_type_t type, int *match_tags, int eflags,
+		       int *match_end_ofs);
+
+#ifdef TRE_APPROX
+reg_errcode_t
+tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, ssize_t len,
+		    tre_str_type_t type, int *match_tags, regamatch_t *match,
+		    regaparams_t params, int eflags, int *match_end_ofs);
+#endif /* TRE_APPROX */
+
+#endif /* TRE_INTERNAL_H */
+
+/* EOF */
diff --git a/deps/tre/lib/tre-match-backtrack.c b/deps/tre/lib/tre-match-backtrack.c
new file mode 100644
index 000000000..7e184929e
--- /dev/null
+++ b/deps/tre/lib/tre-match-backtrack.c
@@ -0,0 +1,676 @@
+/*
+  tre-match-backtrack.c - TRE backtracking regex matching engine
+
+  This software is released under a BSD-style license.
+  See the file LICENSE for details and copyright.
+
+*/
+
+/*
+  This matcher is for regexps that use back referencing.  Regexp matching
+  with back referencing is an NP-complete problem on the number of back
+  references.  The easiest way to match them is to use a backtracking
+  routine which basically goes through all possible paths in the TNFA
+  and chooses the one which results in the best (leftmost and longest)
+  match.  This can be spectacularly expensive and may run out of stack
+  space, but there really is no better known generic algorithm.  Quoting
+  Henry Spencer from comp.compilers:
+
+
+    POSIX.2 REs require longest match, which is really exciting to
+    implement since the obsolete ("basic") variant also includes
+    \<digit>.  I haven't found a better way of tackling this than doing
+    a preliminary match using a DFA (or simulation) on a modified RE
+    that just replicates subREs for \<digit>, and then doing a
+    backtracking match to determine whether the subRE matches were
+    right.  This can be rather slow, but I console myself with the
+    thought that people who use \<digit> deserve very slow execution.
+    (Pun unintentional but very appropriate.)
+
+*/
+
+/* NOTE(review): the bare `#include' lines in this file carry no header name in this copy of the patch; restore the operands from the original commit. */
+#ifdef HAVE_CONFIG_H
+#include
+#endif /* HAVE_CONFIG_H */
+
+#ifdef TRE_USE_ALLOCA
+/* AIX requires this to be the first thing in the file.  */
+#ifndef __GNUC__
+# if HAVE_ALLOCA_H
+#  include
+# else
+#  ifdef _AIX
+ #pragma alloca
+#  else
+#   ifndef alloca /* predefined by HP cc +Olibcalls */
+char *alloca ();
+#   endif
+#  endif
+# endif
+#endif
+#endif /* TRE_USE_ALLOCA */
+
+#include
+#include
+#include
+#ifdef HAVE_WCHAR_H
+#include
+#endif /* HAVE_WCHAR_H */
+#ifdef HAVE_WCTYPE_H
+#include
+#endif /* HAVE_WCTYPE_H */
+#ifndef TRE_WCHAR
+#include
+#endif /* !TRE_WCHAR */
+#ifdef HAVE_MALLOC_H
+#include
+#endif /* HAVE_MALLOC_H */
+
+#include "tre-internal.h"
+#include "tre-mem.h"
+#include "tre-match-utils.h"
+#include "xmalloc.h"
+/* Snapshot of the matcher state; BT_STACK_PUSH fills one and BT_STACK_POP restores from it. */
+typedef struct {
+  int pos;
+  const char *str_byte;
+#ifdef TRE_WCHAR
+  const wchar_t *str_wide;
+#endif /* TRE_WCHAR */
+  tre_tnfa_transition_t *state;
+  int state_id;
+  int next_c;
+  int *tags;
+#ifdef TRE_MBSTATE
+  mbstate_t mbstate;
+#endif /* TRE_MBSTATE */
+} tre_backtrack_item_t;
+/* Doubly linked stack node.  A pop only steps back to `prev', so nodes stay linked through `next' and are reused by later pushes. */
+typedef struct tre_backtrack_struct {
+  tre_backtrack_item_t item;
+  struct tre_backtrack_struct *prev;
+  struct tre_backtrack_struct *next;
+} *tre_backtrack_t;
+/* The wide-string pointer and the mbstate are saved and restored only in builds that have them; otherwise these helpers expand to nothing. */
+#ifdef TRE_WCHAR
+#define BT_STACK_WIDE_IN(_str_wide)	stack->item.str_wide = (_str_wide)
+#define BT_STACK_WIDE_OUT		(str_wide) = stack->item.str_wide
+#else /* !TRE_WCHAR */
+#define BT_STACK_WIDE_IN(_str_wide)
+#define BT_STACK_WIDE_OUT
+#endif /* !TRE_WCHAR */
+
+#ifdef TRE_MBSTATE
+#define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
+#define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
+#else /* !TRE_MBSTATE */
+#define BT_STACK_MBSTATE_IN
+#define BT_STACK_MBSTATE_OUT
+#endif /* !TRE_MBSTATE */
+
+/* Allocation wrappers: with TRE_USE_ALLOCA the destroy and free operations are no-ops. */
+#ifdef TRE_USE_ALLOCA
+#define tre_bt_mem_new		  tre_mem_newa
+#define tre_bt_mem_alloc	  tre_mem_alloca
+#define tre_bt_mem_destroy(obj)	  do { } while (0)
+#define xafree(obj)		  do { } while (0) /* do nothing, obj was obtained with alloca() */
+#else /* !TRE_USE_ALLOCA */
+#define tre_bt_mem_new		  tre_mem_new
+#define tre_bt_mem_alloc	  tre_mem_alloc
+#define tre_bt_mem_destroy	  tre_mem_destroy
+#define xafree(obj)		  xfree(obj)
+#endif /* !TRE_USE_ALLOCA */
+
+/* Push a snapshot, allocating a node only when no spare node follows the top.  On allocation failure it releases mem, tags, pmatch and states_seen and returns REG_ESPACE from the enclosing function. */
+#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
+  do									      \
+    {									      \
+      int i;								      \
+      if (!stack->next)							      \
+	{								      \
+	  tre_backtrack_t s;						      \
+	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
+	  if (!s)							      \
+	    {								      \
+	      tre_bt_mem_destroy(mem);					      \
+	      if (tags)							      \
+		xafree(tags);						      \
+	      if (pmatch)						      \
+		xafree(pmatch);						      \
+	      if (states_seen)						      \
+		xafree(states_seen);					      \
+	      return REG_ESPACE;					      \
+	    }								      \
+	  s->prev = stack;						      \
+	  s->next = NULL;						      \
+	  s->item.tags = tre_bt_mem_alloc(mem,				      \
+					  sizeof(*tags) * tnfa->num_tags);    \
+	  if (!s->item.tags)						      \
+	    {								      \
+	      tre_bt_mem_destroy(mem);					      \
+	      if (tags)							      \
+		xafree(tags);						      \
+	      if (pmatch)						      \
+		xafree(pmatch);						      \
+	      if (states_seen)						      \
+		xafree(states_seen);					      \
+	      return REG_ESPACE;					      \
+	    }								      \
+	  stack->next = s;						      \
+	  stack = s;							      \
+	}								      \
+      else								      \
+	stack = stack->next;						      \
+      stack->item.pos = (_pos);						      \
+      stack->item.str_byte = (_str_byte);				      \
+      BT_STACK_WIDE_IN(_str_wide);					      \
+      stack->item.state = (_state);					      \
+      stack->item.state_id = (_state_id);				      \
+      stack->item.next_c = (_next_c);					      \
+      for (i = 0; i < tnfa->num_tags; i++)				      \
+	stack->item.tags[i] = (_tags)[i];				      \
+      BT_STACK_MBSTATE_IN;						      \
+    }									      \
+  while (/*CONSTCOND*/(void)0,0)
+/* Restore pos, the string pointers, state, next_c and the tag values from the top snapshot, then step back to the previous node. */
+#define BT_STACK_POP()							      \
+  do									      \
+    {									      \
+      int i;								      \
+      assert(stack->prev);						      \
+      pos = stack->item.pos;						      \
+      if (type == STR_USER)                                                   \
+        str_source->rewind(pos + pos_add_next, str_source->context);          \
+      str_byte = stack->item.str_byte;					      \
+      BT_STACK_WIDE_OUT;						      \
+      state = stack->item.state;					      \
+      next_c = (tre_char_t) stack->item.next_c;				      \
+      for (i = 0; i < tnfa->num_tags; i++)				      \
+	tags[i] = stack->item.tags[i];					      \
+      BT_STACK_MBSTATE_OUT;						      \
+      stack = stack->prev;						      \
+    }									      \
+  while (/*CONSTCOND*/(void)0,0)
+
+#undef MIN
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+/* Backtracking matcher.  Returns REG_OK or REG_NOMATCH and stores the best match end offset (-1 for none) in *match_end_ofs; returns REG_ESPACE on allocation failure.  When match_tags is non-NULL it receives the tag values of the winning match. */
+reg_errcode_t
+tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
+		       ssize_t len, tre_str_type_t type, int *match_tags,
+		       int eflags, int *match_end_ofs)
+{
+  /* State variables required by GET_NEXT_WCHAR. */
+  tre_char_t prev_c = 0, next_c = 0;
+  const char *str_byte = string;
+  ssize_t pos = 0;
+  unsigned int pos_add_next = 1;
+#ifdef TRE_WCHAR
+  const wchar_t *str_wide = string;
+#ifdef TRE_MBSTATE
+  mbstate_t mbstate;
+#endif /* TRE_MBSTATE */
+#endif /* TRE_WCHAR */
+  int reg_notbol = eflags & REG_NOTBOL;
+  int reg_noteol = eflags & REG_NOTEOL;
+  int reg_newline = tnfa->cflags & REG_NEWLINE;
+  int str_user_end = 0;
+
+  /* These are used to remember the necessary values of the above
+     variables to return to the position where the current search
+     started from. */
+  int next_c_start;
+  const char *str_byte_start;
+  int pos_start = -1;
+#ifdef TRE_WCHAR
+  const wchar_t *str_wide_start;
+#endif /* TRE_WCHAR */
+#ifdef TRE_MBSTATE
+  mbstate_t mbstate_start;
+#endif /* TRE_MBSTATE */
+  reg_errcode_t ret;
+
+  /* End offset of best match so far, or -1 if no match found yet. */
+  int match_eo = -1;
+  /* Tag arrays. */
+  int *next_tags, *tags = NULL;
+  /* Current TNFA state. */
+  tre_tnfa_transition_t *state;
+  int *states_seen = NULL;
+
+  /* Memory allocator for allocating the backtracking stack. */
+  tre_mem_t mem = tre_bt_mem_new();
+
+  /* The backtracking stack. */
+  tre_backtrack_t stack;
+
+  tre_tnfa_transition_t *trans_i;
+  regmatch_t *pmatch = NULL;
+
+  /*
+   * TRE internals tend to use int instead of size_t for positions or
+   * lengths and don't check for overflow.  This will take time to fix
+   * properly.  In the meantime, simply limit the input to what we can
+   * handle.
+   */
+  if (len > TRE_MAX_STRING)
+    len = TRE_MAX_STRING;
+
+#ifdef TRE_MBSTATE
+  memset(&mbstate, '\0', sizeof(mbstate));
+#endif /* TRE_MBSTATE */
+
+  if (!mem)
+    return REG_ESPACE;
+  stack = tre_bt_mem_alloc(mem, sizeof(*stack));
+  if (!stack)
+    {
+      ret = REG_ESPACE;
+      goto error_exit;
+    }
+  stack->prev = NULL;
+  stack->next = NULL;
+
+  DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
+  DPRINT(("len = %zd\n", len));
+
+#ifdef TRE_USE_ALLOCA
+  tags = alloca(sizeof(*tags) * tnfa->num_tags);
+  pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
+  states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
+#else /* !TRE_USE_ALLOCA */
+  if (tnfa->num_tags)
+    {
+      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
+      if (!tags)
+	{
+	  ret = REG_ESPACE;
+	  goto error_exit;
+	}
+    }
+  if (tnfa->num_submatches)
+    {
+      pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
+      if (!pmatch)
+	{
+	  ret = REG_ESPACE;
+	  goto error_exit;
+	}
+    }
+  if (tnfa->num_states)
+    {
+      states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
+      if (!states_seen)
+	{
+	  ret = REG_ESPACE;
+	  goto error_exit;
+	}
+    }
+#endif /* !TRE_USE_ALLOCA */
+/* Each attempt re-enters here: tag values and the states_seen flags are reset before the start position is advanced. */
+ retry:
+  {
+    int i;
+    for (i = 0; i < tnfa->num_tags; i++)
+      {
+	tags[i] = -1;
+	if (match_tags)
+	  match_tags[i] = -1;
+      }
+    for (i = 0; i < tnfa->num_states; i++)
+      states_seen[i] = 0;
+  }
+
+  state = NULL;
+  pos = pos_start;
+  if (type == STR_USER)
+    str_source->rewind(pos + pos_add_next, str_source->context);
+  GET_NEXT_WCHAR();
+  pos_start = pos;
+  next_c_start = next_c;
+  str_byte_start = str_byte;
+#ifdef TRE_WCHAR
+  str_wide_start = str_wide;
+#endif /* TRE_WCHAR */
+#ifdef TRE_MBSTATE
+  mbstate_start = mbstate;
+#endif /* TRE_MBSTATE */
+
+  /* Handle initial states. */
+  next_tags = NULL;
+  for (trans_i = tnfa->initial; trans_i->state; trans_i++)
+    {
+      DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
+      if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
+	{
+	  DPRINT(("assert failed\n"));
+	  continue;
+	}
+      if (state == NULL)
+	{
+	  /* Start from this state. */
+	  state = trans_i->state;
+	  next_tags = trans_i->tags;
+	}
+      else
+	{
+	  /* Backtrack to this state. */
+	  DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
+	  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
+			trans_i->state_id, next_c, tags, mbstate);
+	  {
+	    int *tmp = trans_i->tags;
+	    if (tmp)
+	      while (*tmp >= 0)
+		stack->item.tags[*tmp++] = pos;
+	  }
+	}
+    }
+/* Record the tags of the initial transition chosen as the starting state. */
+  if (next_tags)
+    for (; *next_tags >= 0; next_tags++)
+      tags[*next_tags] = pos;
+
+
+  DPRINT(("entering match loop, pos %zd, str_byte %p\n", pos, str_byte));
+  DPRINT(("pos:chr/code | state and tags\n"));
+  DPRINT(("-------------+------------------------------------------------\n"));
+
+  if (state == NULL)
+    goto backtrack;
+
+  while (/*CONSTCOND*/(void)1,1)
+    {
+      tre_tnfa_transition_t *next_state;
+      int empty_br_match;
+
+      DPRINT(("start loop\n"));
+      if (state == tnfa->final)
+	{
+	  DPRINT((" match found, %d %zd\n", match_eo, pos));
+	  if (match_eo < pos
+	      || (match_eo == pos
+		  && match_tags
+		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
+				   tags, match_tags)))
+	    {
+	      int i;
+	      /* This match wins the previous match. */
+	      DPRINT((" win previous\n"));
+	      match_eo = pos;
+	      if (match_tags)
+		for (i = 0; i < tnfa->num_tags; i++)
+		  match_tags[i] = tags[i];
+	    }
+	  /* Our TNFAs never have transitions leaving from the final state,
+	     so we jump right to backtracking. */
+	  goto backtrack;
+	}
+
+#ifdef TRE_DEBUG
+      DPRINT(("%3zd:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
+	      state));
+      {
+	int i;
+	for (i = 0; i < tnfa->num_tags; i++)
+	  DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
+	DPRINT(("\n"));
+      }
+#endif /* TRE_DEBUG */
+
+      /* Go to the next character in the input string. */
+      empty_br_match = 0;
+      trans_i = state;
+      if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
+	{
+	  /* This is a back reference state.  All transitions leaving from
+	     this state have the same back reference "assertion".  Instead
+	     of reading the next character, we match the back reference. */
+	  int so, eo, bt = trans_i->u.backref;
+	  int bt_len;
+	  int result;
+
+	  DPRINT((" should match back reference %d\n", bt));
+	  /* Get the substring we need to match against.  Remember to
+	     turn off REG_NOSUB temporarily. */
+	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
+			  tnfa, tags, pos);
+	  so = pmatch[bt].rm_so;
+	  eo = pmatch[bt].rm_eo;
+	  bt_len = eo - so;
+
+#ifdef TRE_DEBUG
+	  {
+	    int slen;
+	    if (len < 0)
+	      slen = bt_len;
+	    else
+	      slen = MIN(bt_len, len - pos);
+
+	    if (type == STR_BYTE)
+	      {
+		DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n",
+			bt_len, so, eo, bt_len, (char*)string + so));
+		DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
+	      }
+#ifdef TRE_WCHAR
+	    else if (type == STR_WIDE)
+	      {
+		DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
+			bt_len, so, eo, bt_len, (wchar_t*)string + so));
+		DPRINT((" current string is '%.*" STRF "'\n",
+			slen, str_wide - 1));
+	      }
+#endif /* TRE_WCHAR */
+	  }
+#endif
+/* A negative len selects the terminator-bounded comparisons; otherwise the remaining length is checked first and memcmp()/wmemcmp() is used. */
+	  if (len < 0)
+	    {
+	      if (type == STR_USER)
+		result = str_source->compare((unsigned)so, (unsigned)pos,
+					     (unsigned)bt_len,
+					     str_source->context);
+#ifdef TRE_WCHAR
+	      else if (type == STR_WIDE)
+		result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
+				 (size_t)bt_len);
+#endif /* TRE_WCHAR */
+	      else
+		result = strncmp((const char*)string + so, str_byte - 1,
+				 (size_t)bt_len);
+	    }
+	  else if (len - pos < bt_len)
+	    result = 1;
+#ifdef TRE_WCHAR
+	  else if (type == STR_WIDE)
+	    result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
+			     (size_t)bt_len);
+#endif /* TRE_WCHAR */
+	  else
+	    result = memcmp((const char*)string + so, str_byte - 1,
+			    (size_t)bt_len);
+
+	  if (result == 0)
+	    {
+	      /* Back reference matched.  Check for infinite loop. */
+	      if (bt_len == 0)
+		empty_br_match = 1;
+	      if (empty_br_match && states_seen[trans_i->state_id])
+		{
+		  DPRINT((" avoid loop\n"));
+		  goto backtrack;
+		}
+
+	      states_seen[trans_i->state_id] = empty_br_match;
+
+	      /* Advance in input string and resync `prev_c', `next_c'
+		 and pos. */
+	      DPRINT((" back reference matched\n"));
+	      str_byte += bt_len - 1;
+#ifdef TRE_WCHAR
+	      str_wide += bt_len - 1;
+#endif /* TRE_WCHAR */
+	      pos += bt_len - 1;
+	      GET_NEXT_WCHAR();
+	      DPRINT((" pos now %zd\n", pos));
+	    }
+	  else
+	    {
+	      DPRINT((" back reference did not match\n"));
+	      goto backtrack;
+	    }
+	}
+      else
+	{
+	  /* Check for end of string. */
+	  if (len < 0)
+	    {
+	      if (type == STR_USER)
+		{
+		  if (str_user_end)
+		    goto backtrack;
+		}
+	      else if (next_c == L'\0' || pos >= TRE_MAX_STRING)
+		goto backtrack;
+	    }
+	  else
+	    {
+	      if (pos >= len)
+		goto backtrack;
+	    }
+
+	  /* Read the next character. */
+	  GET_NEXT_WCHAR();
+	}
+/* Scan the transitions leaving the current state: the first that accepts prev_c is followed, every further one is pushed for backtracking. */
+      next_state = NULL;
+      for (trans_i = state; trans_i->state; trans_i++)
+	{
+	  DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
+		  trans_i->code_min, trans_i->code_max,
+		  trans_i->code_min, trans_i->code_max,
+		  trans_i->assertions, trans_i->state_id));
+	  if (trans_i->code_min <= (tre_cint_t)prev_c
+	      && trans_i->code_max >= (tre_cint_t)prev_c)
+	    {
+	      if (trans_i->assertions
+		  && (CHECK_ASSERTIONS(trans_i->assertions)
+		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
+		{
+		  DPRINT((" assertion failed\n"));
+		  continue;
+		}
+
+	      if (next_state == NULL)
+		{
+		  /* First matching transition. */
+		  DPRINT((" Next state is %d\n", trans_i->state_id));
+		  next_state = trans_i->state;
+		  next_tags = trans_i->tags;
+		}
+	      else
+		{
+		  /* Second matching transition.  We may need to backtrack here
+		     to take this transition instead of the first one, so we
+		     push this transition in the backtracking stack so we can
+		     jump back here if needed. */
+		  DPRINT((" saving state %d for backtracking\n",
+			  trans_i->state_id));
+		  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
+				trans_i->state_id, next_c, tags, mbstate);
+		  {
+		    int *tmp;
+		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
+		      stack->item.tags[*tmp] = pos;
+		  }
+#if 0 /* XXX - it's important not to look at all transitions here to keep
+	 the stack small! */
+		  break;
+#endif
+		}
+	    }
+	}
+
+      if (next_state != NULL)
+	{
+	  /* Matching transitions were found.  Take the first one. */
+	  state = next_state;
+
+	  /* Update the tag values. */
+	  if (next_tags)
+	    while (*next_tags >= 0)
+	      tags[*next_tags++] = pos;
+	}
+      else
+	{
+	backtrack:
+	  /* A matching transition was not found.  Try to backtrack. */
+	  if (stack->prev)
+	    {
+	      DPRINT((" backtracking\n"));
+	      if (stack->item.state->assertions & ASSERT_BACKREF)
+		{
+		  DPRINT((" states_seen[%d] = 0\n",
+			  stack->item.state_id));
+		  states_seen[stack->item.state_id] = 0;
+		}
+
+	      BT_STACK_POP();
+	    }
+	  else if (match_eo < 0)
+	    {
+	      /* Try starting from a later position in the input string. */
+	      /* Check for end of string. */
+	      if (len < 0)
+		{
+		  if (next_c_start == L'\0' || pos_start >= TRE_MAX_STRING)
+		    {
+		      DPRINT(("end of string.\n"));
+		      break;
+		    }
+		}
+	      else
+		{
+		  if (pos_start >= len)
+		    {
+		      DPRINT(("end of string.\n"));
+		      break;
+		    }
+		}
+	      DPRINT(("restarting from next start position\n"));
+	      next_c = (tre_char_t) next_c_start;
+#ifdef TRE_MBSTATE
+	      mbstate = mbstate_start;
+#endif /* TRE_MBSTATE */
+	      str_byte = str_byte_start;
+#ifdef TRE_WCHAR
+	      str_wide = str_wide_start;
+#endif /* TRE_WCHAR */
+	      goto retry;
+	    }
+	  else
+	    {
+	      DPRINT(("finished\n"));
+	      break;
+	    }
+	}
+    }
+
+  ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
+  *match_end_ofs = match_eo;
+/* Shared cleanup for success and failure; *match_end_ofs is left untouched when control arrives here through a goto. */
+ error_exit:
+  tre_bt_mem_destroy(mem);
+#ifndef TRE_USE_ALLOCA
+  if (tags)
+    xafree(tags);
+  if (pmatch)
+    xafree(pmatch);
+  if (states_seen)
+    xafree(states_seen);
+#endif /* !TRE_USE_ALLOCA */
+
+  return ret;
+}
diff --git a/deps/tre/lib/tre-match-parallel.c b/deps/tre/lib/tre-match-parallel.c
new file mode 100644
index 000000000..151083746
--- /dev/null
+++ b/deps/tre/lib/tre-match-parallel.c
@@ -0,0 +1,538 @@
+/*
+  tre-match-parallel.c - TRE parallel regex matching engine
+
+  This software is released under a BSD-style license.
+  See the file LICENSE for details and copyright.
+
+*/
+
+/*
+  This algorithm searches for matches basically by reading characters
+  in the searched string one by one, starting at the beginning.	 All
+  matching paths in the TNFA are traversed in parallel.	 When two or
+  more paths reach the same state, exactly one is chosen according to
+  tag ordering rules; if returning submatches is not required it does
+  not matter which path is chosen.
+
+  The worst case time required for finding the leftmost and longest
+  match, or determining that there is no match, is always linearly
+  dependent on the length of the text being searched.
+
+  This algorithm cannot handle TNFAs with back referencing nodes.
+  See `tre-match-backtrack.c'.
+*/
+
+
+#ifdef HAVE_CONFIG_H
+#include
+#endif /* HAVE_CONFIG_H */
+
+#ifdef TRE_USE_ALLOCA
+/* AIX requires this to be the first thing in the file.
*/ +#ifndef __GNUC__ +# if HAVE_ALLOCA_H +# include +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +char *alloca (); +# endif +# endif +# endif +#endif +#endif /* TRE_USE_ALLOCA */ + +#include +#include +#include +#include +#ifdef HAVE_WCHAR_H +#include +#endif /* HAVE_WCHAR_H */ +#ifdef HAVE_WCTYPE_H +#include +#endif /* HAVE_WCTYPE_H */ +#ifndef TRE_WCHAR +#include +#endif /* !TRE_WCHAR */ +#ifdef HAVE_MALLOC_H +#include +#endif /* HAVE_MALLOC_H */ + +#include "tre-internal.h" +#include "tre-match-utils.h" +#include "xmalloc.h" + + + +typedef struct { + tre_tnfa_transition_t *state; + int *tags; +} tre_tnfa_reach_t; + +typedef struct { + int pos; + int **tags; +} tre_reach_pos_t; + + +#ifdef TRE_DEBUG +static void +tre_print_reach(const tre_tnfa_reach_t *reach, int num_tags) +{ + int i; + + while (reach->state != NULL) + { + DPRINT((" %p", (void *)reach->state)); + if (num_tags > 0) + { + DPRINT(("/")); + for (i = 0; i < num_tags; i++) + { + DPRINT(("%d:%d", i, reach->tags[i])); + if (i < (num_tags-1)) + DPRINT((",")); + } + } + reach++; + } + DPRINT(("\n")); + +} +#endif /* TRE_DEBUG */ + +reg_errcode_t +tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, ssize_t len, + tre_str_type_t type, int *match_tags, int eflags, + int *match_end_ofs) +{ + /* State variables required by GET_NEXT_WCHAR. 
*/ + tre_char_t prev_c = 0, next_c = 0; + const char *str_byte = string; + ssize_t pos = -1; + unsigned int pos_add_next = 1; +#ifdef TRE_WCHAR + const wchar_t *str_wide = string; +#ifdef TRE_MBSTATE + mbstate_t mbstate; +#endif /* TRE_MBSTATE */ +#endif /* TRE_WCHAR */ + reg_errcode_t ret; + int reg_notbol = eflags & REG_NOTBOL; + int reg_noteol = eflags & REG_NOTEOL; + int reg_newline = tnfa->cflags & REG_NEWLINE; + int str_user_end = 0; + + char *buf; + tre_tnfa_transition_t *trans_i; + tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; + tre_reach_pos_t *reach_pos; + int *tag_i; + int num_tags, i; + + int match_eo = -1; /* end offset of match (-1 if no match found yet) */ + int new_match = 0; + int *tmp_tags = NULL; + int *tmp_iptr; + + /* + * TRE internals tend to use int instead of size_t for positions or + * lengths and don't check for overflow. This will take time to fix + * properly. In the meantime, simply limit the input to what we can + * handle. + */ + if (len > TRE_MAX_STRING) + len = TRE_MAX_STRING; + +#ifdef TRE_MBSTATE + memset(&mbstate, '\0', sizeof(mbstate)); +#endif /* TRE_MBSTATE */ + + DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); + + if (!match_tags) + num_tags = 0; + else + num_tags = tnfa->num_tags; + + /* Allocate memory for temporary data required for matching. This needs to + be done for every matching operation to be thread safe. This allocates + everything in a single large block from the stack frame using alloca() + or with malloc() if alloca is unavailable. 
*/ + { + size_t tbytes, rbytes, pbytes, xbytes, total_bytes; + size_t num_states = (size_t)tnfa->num_states; + size_t state_tag_bytes, reach_bytes; + size_t padding = (sizeof(long) - 1) * 4; + char *tmp_buf; + + if (num_states > SIZE_MAX / sizeof(*reach_pos)) + return REG_ESPACE; + pbytes = sizeof(*reach_pos) * num_states; + + if (num_states + 1 > SIZE_MAX / sizeof(*reach_next)) + return REG_ESPACE; + rbytes = sizeof(*reach_next) * (num_states + 1); + + if ((size_t)num_tags > SIZE_MAX / sizeof(*tmp_tags)) + return REG_ESPACE; + tbytes = sizeof(*tmp_tags) * (size_t)num_tags; + + if ((size_t)num_tags > SIZE_MAX / sizeof(int)) + return REG_ESPACE; + xbytes = sizeof(int) * (size_t)num_tags; + + if (num_states > 0 && xbytes > SIZE_MAX / num_states) + return REG_ESPACE; + state_tag_bytes = xbytes * num_states; + + if (rbytes > SIZE_MAX - state_tag_bytes) + return REG_ESPACE; + reach_bytes = rbytes + state_tag_bytes; + + if (reach_bytes > (SIZE_MAX - padding - tbytes - pbytes) / 2) + return REG_ESPACE; + + /* Compute the length of the block we need. */ + total_bytes = + padding + reach_bytes * 2 + tbytes + pbytes; + + /* Allocate the memory. */ +#ifdef TRE_USE_ALLOCA + buf = alloca(total_bytes); +#else /* !TRE_USE_ALLOCA */ + buf = xmalloc(total_bytes); +#endif /* !TRE_USE_ALLOCA */ + if (buf == NULL) + return REG_ESPACE; + memset(buf, 0, total_bytes); + + /* Get the various pointers within tmp_buf (properly aligned). 
*/ + tmp_tags = (void *)buf; + tmp_buf = buf + tbytes; + tmp_buf += ALIGN(tmp_buf, long); + reach_next = (void *)tmp_buf; + tmp_buf += rbytes; + tmp_buf += ALIGN(tmp_buf, long); + reach = (void *)tmp_buf; + tmp_buf += rbytes; + tmp_buf += ALIGN(tmp_buf, long); + reach_pos = (void *)tmp_buf; + tmp_buf += pbytes; + tmp_buf += ALIGN(tmp_buf, long); + for (i = 0; i < tnfa->num_states; i++) + { + reach[i].tags = (void *)tmp_buf; + tmp_buf += xbytes; + reach_next[i].tags = (void *)tmp_buf; + tmp_buf += xbytes; + } + } + + for (i = 0; i < tnfa->num_states; i++) + reach_pos[i].pos = -1; + + /* If only one character can start a match, find it first. */ + if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte) + { + const char *orig_str = str_byte; + int first = tnfa->first_char; + + if (len >= 0) + str_byte = memchr(orig_str, first, (size_t)len); + else + str_byte = strchr(orig_str, first); + if (str_byte == NULL) + { +#ifndef TRE_USE_ALLOCA + if (buf) + xfree(buf); +#endif /* !TRE_USE_ALLOCA */ + return REG_NOMATCH; + } + DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); + if (str_byte >= orig_str + 1) + prev_c = (unsigned char)*(str_byte - 1); + next_c = (unsigned char)*str_byte; + pos = str_byte - orig_str; + if (len < 0 || pos < len) + str_byte++; + } + else + { + GET_NEXT_WCHAR(); + pos = 0; + } + +#if 0 + /* Skip over characters that cannot possibly be the first character + of a match. */ + if (tnfa->firstpos_chars != NULL) + { + char *chars = tnfa->firstpos_chars; + + if (len < 0) + { + const char *orig_str = str_byte; + /* XXX - use strpbrk() and wcspbrk() because they might be + optimized for the target architecture. Try also strcspn() + and wcscspn() and compare the speeds. 
*/ + while (next_c != L'\0' && !chars[next_c]) + { + next_c = *str_byte++; + } + prev_c = *(str_byte - 2); + pos += str_byte - orig_str; + DPRINT(("skipped %d chars\n", str_byte - orig_str)); + } + else + { + while (pos <= len && !chars[next_c]) + { + prev_c = next_c; + next_c = (unsigned char)(*str_byte++); + pos++; + } + } + } +#endif + + DPRINT(("length: %zd\n", len)); + DPRINT(("pos:chr/code | states and tags\n")); + DPRINT(("-------------+------------------------------------------------\n")); + + reach_next_i = reach_next; + while (/*CONSTCOND*/(void)1,1) + { + /* If no match found yet, add the initial states to `reach_next'. */ + if (match_eo < 0) + { + DPRINT((" init >")); + trans_i = tnfa->initial; + while (trans_i->state != NULL) + { + if (reach_pos[trans_i->state_id].pos < pos) + { + if (trans_i->assertions + && CHECK_ASSERTIONS(trans_i->assertions)) + { + DPRINT(("assertion failed\n")); + trans_i++; + continue; + } + + DPRINT((" %p", (void *)trans_i->state)); + reach_next_i->state = trans_i->state; + for (i = 0; i < num_tags; i++) + reach_next_i->tags[i] = -1; + tag_i = trans_i->tags; + if (tag_i) + while (*tag_i >= 0) + { + if (*tag_i < num_tags) + reach_next_i->tags[*tag_i] = pos; + tag_i++; + } + if (reach_next_i->state == tnfa->final) + { + DPRINT((" found empty match\n")); + match_eo = pos; + new_match = 1; + for (i = 0; i < num_tags; i++) + match_tags[i] = reach_next_i->tags[i]; + } + reach_pos[trans_i->state_id].pos = pos; + reach_pos[trans_i->state_id].tags = &reach_next_i->tags; + reach_next_i++; + } + trans_i++; + } + DPRINT(("\n")); + reach_next_i->state = NULL; + } + else + { + if (num_tags == 0 || reach_next_i == reach_next) + /* We have found a match. */ + break; + } + + /* Check for end of string. 
*/ + if (len < 0) + { + if (type == STR_USER) + { + if (str_user_end) + break; + } + else if (next_c == L'\0' || pos >= TRE_MAX_STRING) + break; + } + else + { + if (pos >= len) + break; + } + + GET_NEXT_WCHAR(); + +#ifdef TRE_DEBUG + DPRINT(("%3zd:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); + tre_print_reach(reach_next, num_tags); + DPRINT(("%3zd:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); + tre_print_reach(reach_next, num_tags); +#endif /* TRE_DEBUG */ + + /* Swap `reach' and `reach_next'. */ + reach_i = reach; + reach = reach_next; + reach_next = reach_i; + + /* For each state in `reach', weed out states that don't fulfill the + minimal matching conditions. */ + if (tnfa->num_minimals && new_match) + { + new_match = 0; + reach_next_i = reach_next; + for (reach_i = reach; reach_i->state; reach_i++) + { + int skip = 0; + for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) + { + int end = tnfa->minimal_tags[i]; + int start = tnfa->minimal_tags[i + 1]; + DPRINT((" Minimal start %d, end %d\n", start, end)); + if (end >= num_tags) + { + DPRINT((" Throwing %p out.\n", reach_i->state)); + skip = 1; + break; + } + else if (reach_i->tags[start] == match_tags[start] + && reach_i->tags[end] < match_tags[end]) + { + DPRINT((" Throwing %p out because t%d < %d\n", + reach_i->state, end, match_tags[end])); + skip = 1; + break; + } + } + if (!skip) + { + reach_next_i->state = reach_i->state; + tmp_iptr = reach_next_i->tags; + reach_next_i->tags = reach_i->tags; + reach_i->tags = tmp_iptr; + reach_next_i++; + } + } + reach_next_i->state = NULL; + + /* Swap `reach' and `reach_next'. */ + reach_i = reach; + reach = reach_next; + reach_next = reach_i; + } + + /* For each state in `reach' see if there is a transition leaving with + the current input symbol to a state not yet in `reach_next', and + add the destination states to `reach_next'. 
*/ + reach_next_i = reach_next; + for (reach_i = reach; reach_i->state; reach_i++) + { + for (trans_i = reach_i->state; trans_i->state; trans_i++) + { + /* Does this transition match the input symbol? */ + if (trans_i->code_min <= (tre_cint_t)prev_c && + trans_i->code_max >= (tre_cint_t)prev_c) + { + if (trans_i->assertions + && (CHECK_ASSERTIONS(trans_i->assertions) + || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) + { + DPRINT(("assertion failed\n")); + continue; + } + + /* Compute the tags after this transition. */ + for (i = 0; i < num_tags; i++) + tmp_tags[i] = reach_i->tags[i]; + tag_i = trans_i->tags; + if (tag_i != NULL) + while (*tag_i >= 0) + { + if (*tag_i < num_tags) + tmp_tags[*tag_i] = pos; + tag_i++; + } + + if (reach_pos[trans_i->state_id].pos < pos) + { + /* Found an unvisited node. */ + reach_next_i->state = trans_i->state; + tmp_iptr = reach_next_i->tags; + reach_next_i->tags = tmp_tags; + tmp_tags = tmp_iptr; + reach_pos[trans_i->state_id].pos = pos; + reach_pos[trans_i->state_id].tags = &reach_next_i->tags; + + if (reach_next_i->state == tnfa->final + && (match_eo == -1 + || (num_tags > 0 + && reach_next_i->tags[0] <= match_tags[0]))) + { + DPRINT((" found match %p\n", trans_i->state)); + match_eo = pos; + new_match = 1; + for (i = 0; i < num_tags; i++) + match_tags[i] = reach_next_i->tags[i]; + } + reach_next_i++; + + } + else + { + assert(reach_pos[trans_i->state_id].pos == pos); + /* Another path has also reached this state. We choose + the winner by examining the tag values for both + paths. */ + if (tre_tag_order(num_tags, tnfa->tag_directions, + tmp_tags, + *reach_pos[trans_i->state_id].tags)) + { + /* The new path wins. 
*/ + tmp_iptr = *reach_pos[trans_i->state_id].tags; + *reach_pos[trans_i->state_id].tags = tmp_tags; + if (trans_i->state == tnfa->final) + { + DPRINT((" found better match\n")); + match_eo = pos; + new_match = 1; + for (i = 0; i < num_tags; i++) + match_tags[i] = tmp_tags[i]; + } + tmp_tags = tmp_iptr; + } + } + } + } + } + reach_next_i->state = NULL; + } + + DPRINT(("match end offset = %d\n", match_eo)); + + *match_end_ofs = match_eo; + ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; + +#ifndef TRE_USE_ALLOCA + if (buf) + xfree(buf); +#endif /* !TRE_USE_ALLOCA */ + return ret; +} + +/* EOF */ diff --git a/deps/tre/lib/tre-match-utils.h b/deps/tre/lib/tre-match-utils.h new file mode 100644 index 000000000..76e8b1972 --- /dev/null +++ b/deps/tre/lib/tre-match-utils.h @@ -0,0 +1,215 @@ +/* + tre-match-utils.h - TRE matcher helper definitions + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#define str_source ((const tre_str_source*)string) + +#ifdef TRE_WCHAR + +#ifdef TRE_MULTIBYTE + +/* Wide character and multibyte support. 
*/ + +#define GET_NEXT_WCHAR() \ + do { \ + prev_c = next_c; \ + if (type == STR_BYTE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) \ + next_c = '\0'; \ + else \ + next_c = (unsigned char)(*str_byte++); \ + } \ + else if (type == STR_WIDE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) \ + next_c = L'\0'; \ + else \ + next_c = *str_wide++; \ + } \ + else if (type == STR_MBS) \ + { \ + pos += pos_add_next; \ + if (str_byte == NULL) \ + next_c = L'\0'; \ + else \ + { \ + size_t w; \ + size_t max; \ + if (len >= 0) \ + max = len - pos; \ + else \ + max = 32; \ + if (max <= 0) \ + { \ + next_c = L'\0'; \ + pos_add_next = 1; \ + } \ + else \ + { \ + w = tre_mbrtowc(&next_c, str_byte, (size_t)max, &mbstate); \ + if (w == (size_t)-1 || w == (size_t)-2) \ + return REG_NOMATCH; \ + if (w == 0 && len >= 0) \ + { \ + pos_add_next = 1; \ + next_c = 0; \ + str_byte++; \ + } \ + else \ + { \ + pos_add_next = w; \ + str_byte += w; \ + } \ + } \ + } \ + } \ + else if (type == STR_USER) \ + { \ + pos += pos_add_next; \ + str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ + str_source->context); \ + } \ + } while(/*CONSTCOND*/(void)0,0) + +#else /* !TRE_MULTIBYTE */ + +/* Wide character support, no multibyte support. */ + +#define GET_NEXT_WCHAR() \ + do { \ + prev_c = next_c; \ + if (type == STR_BYTE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) \ + next_c = '\0'; \ + else \ + next_c = (unsigned char)(*str_byte++); \ + } \ + else if (type == STR_WIDE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) \ + next_c = L'\0'; \ + else \ + next_c = *str_wide++; \ + } \ + else if (type == STR_USER) \ + { \ + pos += pos_add_next; \ + str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ + str_source->context); \ + } \ + } while(/*CONSTCOND*/(void)0,0) + +#endif /* !TRE_MULTIBYTE */ + +#else /* !TRE_WCHAR */ + +/* No wide character or multibyte support. 
*/ + +#define GET_NEXT_WCHAR() \ + do { \ + prev_c = next_c; \ + if (type == STR_BYTE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) \ + next_c = '\0'; \ + else \ + next_c = (unsigned char)(*str_byte++); \ + } \ + else if (type == STR_USER) \ + { \ + pos += pos_add_next; \ + str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ + str_source->context); \ + } \ + } while(/*CONSTCOND*/(void)0,0) + +#endif /* !TRE_WCHAR */ + + + +#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c)) + +#define CHECK_ASSERTIONS(assertions) \ + (((assertions & ASSERT_AT_BOL) \ + && (pos > 0 || reg_notbol) \ + && (prev_c != L'\n' || !reg_newline)) \ + || ((assertions & ASSERT_AT_EOL) \ + && (next_c != L'\0' || reg_noteol) \ + && (next_c != L'\n' || !reg_newline)) \ + || ((assertions & ASSERT_AT_BOW) \ + && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_EOW) \ + && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_WB) \ + && (pos != 0 && next_c != L'\0' \ + && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_WB_NEG) \ + && (pos == 0 || next_c == L'\0' \ + || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) + +#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ + (((trans_i->assertions & ASSERT_CHAR_CLASS) \ + && !(tnfa->cflags & REG_ICASE) \ + && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \ + || ((trans_i->assertions & ASSERT_CHAR_CLASS) \ + && (tnfa->cflags & REG_ICASE) \ + && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \ + && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ + || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \ + && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\ + tnfa->cflags & REG_ICASE))) + + + + +/* Returns 1 if `t1' wins `t2', 0 otherwise. 
*/ +inline static int +tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, + int *t1, int *t2) +{ + int i; + for (i = 0; i < num_tags; i++) + { + if (tag_directions[i] == TRE_TAG_MINIMIZE) + { + if (t1[i] < t2[i]) + return 1; + if (t1[i] > t2[i]) + return 0; + } + else + { + if (t1[i] > t2[i]) + return 1; + if (t1[i] < t2[i]) + return 0; + } + } + /* assert(0);*/ + return 0; +} + +inline static int +tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) +{ + DPRINT(("neg_char_classes_test: %p, %d, %d\n", classes, wc, icase)); + while (*classes != (tre_ctype_t)0) + if ((!icase && tre_isctype(wc, *classes)) + || (icase && (tre_isctype(tre_toupper(wc), *classes) + || tre_isctype(tre_tolower(wc), *classes)))) + return 1; /* Match. */ + else + classes++; + return 0; /* No match. */ +} diff --git a/deps/tre/lib/tre-mem.c b/deps/tre/lib/tre-mem.c new file mode 100644 index 000000000..ca56d2b7e --- /dev/null +++ b/deps/tre/lib/tre-mem.c @@ -0,0 +1,155 @@ +/* + tre-mem.c - TRE memory allocator + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* + This memory allocator is for allocating small memory blocks efficiently + in terms of memory overhead and execution speed. The allocated blocks + cannot be freed individually, only all at once. There can be multiple + allocators, though. +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#include "tre-internal.h" +#include "tre-mem.h" +#include "xmalloc.h" + + +/* Returns a new memory allocator or NULL if out of memory. */ +tre_mem_t +tre_mem_new_impl(int provided, void *provided_block) +{ + tre_mem_t mem; + if (provided) + { + mem = provided_block; + memset(mem, 0, sizeof(*mem)); + } + else + mem = xcalloc(1, sizeof(*mem)); + if (mem == NULL) + return NULL; + return mem; +} + + +/* Frees the memory allocator and all memory allocated with it. 
*/ +void +tre_mem_destroy(tre_mem_t mem) +{ + tre_list_t *tmp, *l = mem->blocks; + + while (l != NULL) + { + xfree(l->data); + tmp = l->next; + xfree(l); + l = tmp; + } + xfree(mem); +} + + +/* Allocates a block of `size' bytes from `mem'. Returns a pointer to the + allocated block or NULL if an underlying malloc() failed. */ +void * +tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, + int zero, size_t size) +{ + void *ptr; + + if (mem->failed) + { + DPRINT(("tre_mem_alloc: oops, called after failure?!\n")); + return NULL; + } + +#ifdef MALLOC_DEBUGGING + if (!provided) + { + ptr = xmalloc(1); + if (ptr == NULL) + { + DPRINT(("tre_mem_alloc: xmalloc forced failure\n")); + mem->failed = 1; + return NULL; + } + xfree(ptr); + } +#endif /* MALLOC_DEBUGGING */ + + if (mem->n < size) + { + /* We need more memory than is available in the current block. + Allocate a new block. */ + tre_list_t *l; + if (provided) + { + DPRINT(("tre_mem_alloc: using provided block\n")); + if (provided_block == NULL) + { + DPRINT(("tre_mem_alloc: provided block was NULL\n")); + mem->failed = 1; + return NULL; + } + mem->ptr = provided_block; + mem->n = TRE_MEM_BLOCK_SIZE; + } + else + { + size_t block_size; + if (size * 8 > TRE_MEM_BLOCK_SIZE) + block_size = size * 8; + else + block_size = TRE_MEM_BLOCK_SIZE; + DPRINT(("tre_mem_alloc: allocating new %zu byte block\n", + block_size)); + l = xmalloc(sizeof(*l)); + if (l == NULL) + { + mem->failed = 1; + return NULL; + } + l->data = xmalloc(block_size); + if (l->data == NULL) + { + xfree(l); + mem->failed = 1; + return NULL; + } + l->next = NULL; + if (mem->current != NULL) + mem->current->next = l; + if (mem->blocks == NULL) + mem->blocks = l; + mem->current = l; + mem->ptr = l->data; + mem->n = block_size; + } + } + + /* Make sure the next pointer will be aligned. */ + size += ALIGN(mem->ptr + size, long); + + /* Allocate from current block. 
*/ + ptr = mem->ptr; + mem->ptr += size; + mem->n -= size; + + /* Set to zero if needed. */ + if (zero) + memset(ptr, 0, size); + + return ptr; +} + +/* EOF */ diff --git a/deps/tre/lib/tre-mem.h b/deps/tre/lib/tre-mem.h new file mode 100644 index 000000000..285940457 --- /dev/null +++ b/deps/tre/lib/tre-mem.h @@ -0,0 +1,66 @@ +/* + tre-mem.h - TRE memory allocator interface + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#ifndef TRE_MEM_H +#define TRE_MEM_H 1 + +#include + +#define TRE_MEM_BLOCK_SIZE 1024 + +typedef struct tre_list { + void *data; + struct tre_list *next; +} tre_list_t; + +typedef struct tre_mem_struct { + tre_list_t *blocks; + tre_list_t *current; + char *ptr; + size_t n; + int failed; + void **provided; +} *tre_mem_t; + + +tre_mem_t tre_mem_new_impl(int provided, void *provided_block); +void *tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, + int zero, size_t size); + +/* Returns a new memory allocator or NULL if out of memory. */ +#define tre_mem_new() tre_mem_new_impl(0, NULL) + +/* Allocates a block of `size' bytes from `mem'. Returns a pointer to the + allocated block or NULL if an underlying malloc() failed. */ +#define tre_mem_alloc(mem, size) tre_mem_alloc_impl(mem, 0, NULL, 0, size) + +/* Allocates a block of `size' bytes from `mem'. Returns a pointer to the + allocated block or NULL if an underlying malloc() failed. The memory + is set to zero. */ +#define tre_mem_calloc(mem, size) tre_mem_alloc_impl(mem, 0, NULL, 1, size) + +#ifdef TRE_USE_ALLOCA +/* alloca() versions. Like above, but memory is allocated with alloca() + instead of malloc(). */ + +#define tre_mem_newa() \ + tre_mem_new_impl(1, alloca(sizeof(struct tre_mem_struct))) + +#define tre_mem_alloca(mem, size) \ + ((mem)->n >= (size) \ + ? 
tre_mem_alloc_impl((mem), 1, NULL, 0, (size)) \ + : tre_mem_alloc_impl((mem), 1, alloca(TRE_MEM_BLOCK_SIZE), 0, (size))) +#endif /* TRE_USE_ALLOCA */ + + +/* Frees the memory allocator and all memory allocated with it. */ +void tre_mem_destroy(tre_mem_t mem); + +#endif /* TRE_MEM_H */ + +/* EOF */ diff --git a/deps/tre/lib/tre-parse.c b/deps/tre/lib/tre-parse.c new file mode 100644 index 000000000..64ab6aca8 --- /dev/null +++ b/deps/tre/lib/tre-parse.c @@ -0,0 +1,1758 @@ +/* + tre-parse.c - Regexp parser + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* + This parser is just a simple recursive descent parser for POSIX.2 + regexps. The parser supports both the obsolete default syntax and + the "extended" syntax, and some nonstandard extensions. +*/ + + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include +#include +#include + +#include "xmalloc.h" +#include "tre-mem.h" +#include "tre-ast.h" +#include "tre-stack.h" +#include "tre-parse.h" + + +/* Characters with special meanings in regexp syntax. */ +#define CHAR_PIPE L'|' +#define CHAR_LPAREN L'(' +#define CHAR_RPAREN L')' +#define CHAR_LBRACE L'{' +#define CHAR_RBRACE L'}' +#define CHAR_LBRACKET L'[' +#define CHAR_RBRACKET L']' +#define CHAR_MINUS L'-' +#define CHAR_STAR L'*' +#define CHAR_QUESTIONMARK L'?' +#define CHAR_PLUS L'+' +#define CHAR_PERIOD L'.' +#define CHAR_COLON L':' +#define CHAR_EQUAL L'=' +#define CHAR_COMMA L',' +#define CHAR_CARET L'^' +#define CHAR_DOLLAR L'$' +#define CHAR_BACKSLASH L'\\' +#define CHAR_HASH L'#' +#define CHAR_TILDE L'~' + + +/* Some macros for expanding \w, \s, etc. 
*/ +static const struct tre_macro_struct { + const char c; + const char *expansion; +} tre_macros[] = + { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, + {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, + {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, + {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, + { 0, NULL } + }; + + +/* Expands a macro delimited by `regex' and `regex_end' to `buf', which + must have at least `len' items. Sets buf[0] to zero if the there + is no match in `tre_macros'. */ +static void +tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, + tre_char_t *buf, size_t buf_len) +{ + int i; + + buf[0] = 0; + if (regex >= regex_end) + return; + + for (i = 0; tre_macros[i].expansion; i++) + { + if (tre_macros[i].c == *regex) + { + unsigned int j; + DPRINT(("Expanding macro '%c' => '%s'\n", + tre_macros[i].c, tre_macros[i].expansion)); + for (j = 0; tre_macros[i].expansion[j] && j < buf_len - 1; j++) + buf[j] = tre_macros[i].expansion[j]; + buf[j] = 0; + break; + } + } +} + +static reg_errcode_t +tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, + tre_ast_node_t ***items) +{ + reg_errcode_t status; + tre_ast_node_t **array = *items; + /* Allocate more space if necessary. */ + if (*i >= *max_i) + { + tre_ast_node_t **new_items; + DPRINT(("out of array space, i = %d\n", *i)); + /* If the array is already 1024 items large, give up -- there's + probably an error in the regexp (e.g. not a '\0' terminated + string and missing ']') */ + if (*max_i > 1024) + return REG_ESPACE; + *max_i *= 2; + new_items = xrealloc(array, sizeof(*items) * *max_i); + if (new_items == NULL) + return REG_ESPACE; + *items = array = new_items; + } + array[*i] = tre_ast_new_literal(mem, min, max); + status = array[*i] == NULL ? REG_ESPACE : REG_OK; + (*i)++; + return status; +} + + +/* Expands a character class to character ranges. 
*/ +static reg_errcode_t +tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, + int *i, int *max_i, int cflags) +{ + reg_errcode_t status = REG_OK; + tre_cint_t c; + int j, min = -1, max = 0; + + DPRINT((" expanding class to character ranges\n")); + for (j = 0; (j < 256) && (status == REG_OK); j++) + { + c = (tre_cint_t) j; + if (tre_isctype(c, class) + || ((cflags & REG_ICASE) + && (tre_isctype(tre_tolower(c), class) + || tre_isctype(tre_toupper(c), class)))) +{ + if (min < 0) + min = c; + max = c; + } + else if (min >= 0) + { + DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max)); + status = tre_new_item(mem, min, max, i, max_i, items); + min = -1; + } + } + if (min >= 0 && status == REG_OK) + status = tre_new_item(mem, min, max, i, max_i, items); + return status; +} + + +static int +tre_compare_items(const void *a, const void *b) +{ + const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; + const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; + tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; + long a_min = l_a->code_min, b_min = l_b->code_min; + + if (a_min < b_min) + return -1; + else if (a_min > b_min) + return 1; + else + return 0; +} + +#ifndef TRE_USE_SYSTEM_WCTYPE + +/* isalnum() and the rest may be macros, so wrap them to functions. 
*/ +int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); } +int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); } + +#ifdef tre_isascii +int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); } +#else /* !tre_isascii */ +int tre_isascii_func(tre_cint_t c) { return !(c >> 7); } +#endif /* !tre_isascii */ + +#ifdef tre_isblank +int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); } +#else /* !tre_isblank */ +int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); } +#endif /* !tre_isblank */ + +int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); } +int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); } +int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); } +int tre_islower_func(tre_cint_t c) { return tre_islower(c); } +int tre_isprint_func(tre_cint_t c) +{ + return +#if defined(WIN32) && TRE_WCHAR + /* On Windows, iswprint(L'\t') incorrectly returns true. */ + c != L'\t' && +#endif + tre_isprint(c); +} +int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); } +int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); } +int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); } +int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); } + +struct { + char *name; + int (*func)(tre_cint_t); +} tre_ctype_map[] = { + { "alnum", &tre_isalnum_func }, + { "alpha", &tre_isalpha_func }, +#ifdef tre_isascii + { "ascii", &tre_isascii_func }, +#endif /* tre_isascii */ +#ifdef tre_isblank + { "blank", &tre_isblank_func }, +#endif /* tre_isblank */ + { "cntrl", &tre_iscntrl_func }, + { "digit", &tre_isdigit_func }, + { "graph", &tre_isgraph_func }, + { "lower", &tre_islower_func }, + { "print", &tre_isprint_func }, + { "punct", &tre_ispunct_func }, + { "space", &tre_isspace_func }, + { "upper", &tre_isupper_func }, + { "xdigit", &tre_isxdigit_func }, + { NULL, NULL} +}; + +tre_ctype_t tre_ctype(const char *name) +{ + int i; + for (i = 0; tre_ctype_map[i].name != NULL; i++) + { + if (strcmp(name, 
tre_ctype_map[i].name) == 0) + return tre_ctype_map[i].func; + } + return (tre_ctype_t)0; +} +#endif /* !TRE_USE_SYSTEM_WCTYPE */ + +/* Maximum number of character classes that can occur in a negated bracket + expression. */ +#define MAX_NEG_CLASSES 64 + +/* Maximum length of character class names. */ +#define MAX_CLASS_NAME + +#define REST(re) (int)(ctx->re_end - (re)), (re) + +static reg_errcode_t +tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, + tre_ctype_t neg_classes[], int *num_neg_classes, + tre_ast_node_t ***items, int *num_items, + int *items_size) +{ + const tre_char_t *re = ctx->re; + reg_errcode_t status; + tre_ctype_t class = (tre_ctype_t)0; + tre_cint_t min = 0, max = 0; + int i = *num_items; + int max_i = *items_size; + int skip; + + /* Build an array of the items in the bracket expression. */ + for (;;) + { + skip = 0; + if (re == ctx->re_end) + { + return REG_EBRACK; + } + if (*re == CHAR_RBRACKET && re > ctx->re) + { + DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); + re++; + break; + } + class = (tre_ctype_t)0; + if (re + 2 < ctx->re_end + && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET) + { + DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); + min = *re; + max = *(re + 2); + re += 3; + /* XXX - Should use collation order instead of encoding values + in character ranges. 
*/ + if (min > max) + return REG_ERANGE; + } + else if (re + 1 < ctx->re_end + && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) + return REG_ECOLLATE; + else if (re + 1 < ctx->re_end + && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) + return REG_ECOLLATE; + else if (re + 1 < ctx->re_end + && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) + { + char tmp_str[64]; + const tre_char_t *endptr = re + 2; + size_t len; + DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); + while (endptr < ctx->re_end && *endptr != CHAR_COLON) + endptr++; + if (endptr != ctx->re_end) + { + len = MIN(endptr - re - 2, 63); +#ifdef TRE_WCHAR + { + tre_char_t tmp_wcs[64]; + wcsncpy(tmp_wcs, re + 2, len); + tmp_wcs[len] = L'\0'; +#if defined HAVE_WCSRTOMBS + { + mbstate_t state; + const tre_char_t *src = tmp_wcs; + memset(&state, '\0', sizeof(state)); + len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); + } +#elif defined HAVE_WCSTOMBS + len = wcstombs(tmp_str, tmp_wcs, 63); +#endif /* defined HAVE_WCSTOMBS */ + if (len == (size_t)-1) + return REG_ECTYPE; + } +#else /* !TRE_WCHAR */ + strncpy(tmp_str, (const char*)re + 2, len); +#endif /* !TRE_WCHAR */ + tmp_str[len] = '\0'; + DPRINT((" class name: %s\n", tmp_str)); + class = tre_ctype(tmp_str); + if (!class) + return REG_ECTYPE; + /* Optimize character classes for 8 bit character sets. */ + if (ctx->mb_cur_max == 1) + { + status = tre_expand_ctype(ctx->mem, class, items, + &i, &max_i, ctx->cflags); + if (status != REG_OK) + return status; + class = (tre_ctype_t)0; + skip = 1; + } + re = endptr + 2; + } + else + return REG_ECTYPE; + min = 0; + max = TRE_CHAR_MAX; + } + else + { + DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); + if (*re == CHAR_MINUS && re + 1 < ctx->re_end + && *(re + 1) != CHAR_RBRACKET + && ctx->re != re) + /* Two ranges are not allowed to share and endpoint. 
*/ + return REG_ERANGE; + min = max = *re++; + } + + if (class && negate) + if (*num_neg_classes >= MAX_NEG_CLASSES) + return REG_ESPACE; + else + neg_classes[(*num_neg_classes)++] = class; + else if (!skip) + { + status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); + if (status != REG_OK) + return status; + ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class; + } + + /* Add opposite-case counterpoints if REG_ICASE is present. + This is broken if there are more than two "same" characters. */ + if (ctx->cflags & REG_ICASE && !class && !skip) + { + tre_cint_t cmin, ccurr; + + DPRINT(("adding opposite-case counterpoints\n")); + while (min <= max) + { + if (tre_islower(min)) + { + cmin = ccurr = tre_toupper(min++); + while (tre_islower(min) && tre_toupper(min) == ccurr + 1 + && min <= max) + ccurr = tre_toupper(min++); + status = tre_new_item(ctx->mem, cmin, ccurr, + &i, &max_i, items); + if (status != REG_OK) + return status; + } + else if (tre_isupper(min)) + { + cmin = ccurr = tre_tolower(min++); + while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 + && min <= max) + ccurr = tre_tolower(min++); + status = tre_new_item(ctx->mem, cmin, ccurr, + &i, &max_i, items); + if (status != REG_OK) + return status; + } + else + min++; + } + } + } + *num_items = i; + *items_size = max_i; + ctx->re = re; + return REG_OK; +} + +static reg_errcode_t +tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +{ + tre_ast_node_t *node = NULL; + int negate = 0; + reg_errcode_t status = REG_OK; + tre_ast_node_t **items, *u, *n; + int i = 0, j, max_i = 32; + long curr_max, curr_min; + tre_ctype_t neg_classes[MAX_NEG_CLASSES]; + int num_neg_classes = 0; + + /* Start off with an array of `max_i' elements. 
*/ + items = xmalloc(sizeof(*items) * max_i); + if (items == NULL) + return REG_ESPACE; + + if (ctx->re < ctx->re_end && *ctx->re == CHAR_CARET) + { + DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); + negate = 1; + ctx->re++; + } + + status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, + &items, &i, &max_i); + + if (status != REG_OK) + goto parse_bracket_done; + + /* Sort the array if we need to negate it. */ + if (negate) + qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); + + curr_max = curr_min = 0; + /* Build a union of the items in the array, negated if necessary. */ + for (j = 0; j < i && status == REG_OK; j++) + { + long min, max; + tre_literal_t *l = items[j]->obj; + min = l->code_min; + max = l->code_max; + + DPRINT(("item: %ld - %ld, class %ld, curr_max = %ld\n", + l->code_min, l->code_max, (long)l->u.class, curr_max)); + + if (negate) + { + if (min < curr_max) + { + /* Overlap. */ + curr_max = MAX(max + 1, curr_max); + DPRINT(("overlap, curr_max = %ld\n", curr_max)); + l = NULL; + } + else + { + /* No overlap. 
*/ + curr_max = min - 1; + if (curr_max >= curr_min) + { + DPRINT(("no overlap\n")); + l->code_min = curr_min; + l->code_max = curr_max; + } + else + { + DPRINT(("no overlap, zero room\n")); + l = NULL; + } + curr_min = curr_max = max + 1; + } + } + + if (l != NULL) + { + int k; + DPRINT(("creating %ld - %ld\n", l->code_min, l->code_max)); + if (num_neg_classes > 0) + { + l->neg_classes = tre_mem_alloc(ctx->mem, + (sizeof(l->neg_classes) + * (num_neg_classes + 1))); + if (l->neg_classes == NULL) + { + status = REG_ESPACE; + break; + } + for (k = 0; k < num_neg_classes; k++) + l->neg_classes[k] = neg_classes[k]; + l->neg_classes[k] = (tre_ctype_t)0; + } + else + l->neg_classes = NULL; + if (node == NULL) + node = items[j]; + else + { + u = tre_ast_new_union(ctx->mem, node, items[j]); + if (u == NULL) + status = REG_ESPACE; + node = u; + } + } + } + + if (status != REG_OK) + goto parse_bracket_done; + + if (negate) + { + int k; + DPRINT(("final: creating %ld - %ld\n", curr_min, (long)TRE_CHAR_MAX)); + n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX); + if (n == NULL) + status = REG_ESPACE; + else + { + tre_literal_t *l = n->obj; + if (num_neg_classes > 0) + { + l->neg_classes = tre_mem_alloc(ctx->mem, + (sizeof(l->neg_classes) + * (num_neg_classes + 1))); + if (l->neg_classes == NULL) + { + status = REG_ESPACE; + goto parse_bracket_done; + } + for (k = 0; k < num_neg_classes; k++) + l->neg_classes[k] = neg_classes[k]; + l->neg_classes[k] = (tre_ctype_t)0; + } + else + l->neg_classes = NULL; + if (node == NULL) + node = n; + else + { + u = tre_ast_new_union(ctx->mem, node, n); + if (u == NULL) + status = REG_ESPACE; + node = u; + } + } + } + + if (status != REG_OK) + goto parse_bracket_done; + +#ifdef TRE_DEBUG + tre_ast_print(node); +#endif /* TRE_DEBUG */ + + parse_bracket_done: + xfree(items); + *result = node; + return status; +} + + +/* Parses a positive decimal integer capped at INT_MAX. Returns -1 if the + string does not contain a valid number. 
*/ +static int +tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) +{ + unsigned long num = 0; + int overflow = 0; + const tre_char_t *r = *regex; + while (r < regex_end && *r >= L'0' && *r <= L'9') + { + if (!overflow) + { + if (num * 10 + *r - L'0' < num) + { + overflow = 1; + } + else + { + num = num * 10 + *r - L'0'; + if (num > INT_MAX) + overflow = 1; + } + } + r++; + } + if (r == *regex) + return -1; + *regex = r; + return overflow ? INT_MAX : (int)num; +} + + +static reg_errcode_t +tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +{ + int min, max, i; + int cost_ins, cost_del, cost_subst, cost_max; + int limit_ins, limit_del, limit_subst, limit_err; + const tre_char_t *r = ctx->re; + const tre_char_t *start; + int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; + int approx = 0; + int costs_set = 0; + int counts_set = 0; + + cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; + limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; + + /* Parse number (minimum repetition count). */ + min = -1; + if (r < ctx->re_end && *r >= L'0' && *r <= L'9') { + DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); + min = tre_parse_int(&r, ctx->re_end); + } + + /* Parse comma and second number (maximum repetition count). */ + max = min; + if (r < ctx->re_end && *r == CHAR_COMMA) + { + if (min < 0) + min = 0; + r++; + DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); + max = tre_parse_int(&r, ctx->re_end); + } + + /* Check that the repeat counts are sane. 
*/ + if (max >= 0 && min > max) + return REG_BADBR; + if (min > RE_DUP_MAX || max > RE_DUP_MAX) + return REG_BADMAX; + + + /* + '{' + optionally followed immediately by a number == minimum repcount + optionally followed by , then a number == maximum repcount + + then a number == maximum insertion count + - then a number == maximum deletion count + # then a number == maximum substitution count + ~ then a number == maximum number of errors + Any of +, -, # or ~ without followed by a number means that + the maximum count/number of errors is infinite. + + An equation of the form + Xi + Yd + Zs < C + can be specified to set costs and the cost limit to a value + different from the default value: + - X is the cost of an insertion + - Y is the cost of a deletion + - Z is the cost of a substitution + - C is the maximum cost + + If no count limit or cost is set for an operation, the operation + is not allowed at all. + */ + + + do { + int done; + start = r; + + /* Parse count limit settings */ + done = 0; + if (!counts_set) + while (r + 1 < ctx->re_end && !done) + { + switch (*r) + { + case CHAR_PLUS: /* Insert limit */ + DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r))); + r++; + limit_ins = tre_parse_int(&r, ctx->re_end); + if (limit_ins < 0) + limit_ins = INT_MAX; + counts_set = 1; + break; + case CHAR_MINUS: /* Delete limit */ + DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r))); + r++; + limit_del = tre_parse_int(&r, ctx->re_end); + if (limit_del < 0) + limit_del = INT_MAX; + counts_set = 1; + break; + case CHAR_HASH: /* Substitute limit */ + DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r))); + r++; + limit_subst = tre_parse_int(&r, ctx->re_end); + if (limit_subst < 0) + limit_subst = INT_MAX; + counts_set = 1; + break; + case CHAR_TILDE: /* Maximum number of changes */ + DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r))); + r++; + limit_err = tre_parse_int(&r, ctx->re_end); + if (limit_err < 0) + limit_err = INT_MAX; + approx = 1; 
+ break; + case CHAR_COMMA: + r++; + break; + case L' ': + r++; + break; + case L'}': + done = 1; + break; + default: + done = 1; + break; + } + } + + /* Parse cost restriction equation. */ + done = 0; + if (!costs_set) + while (r + 1 < ctx->re_end && !done) + { + switch (*r) + { + case CHAR_PLUS: + case L' ': + r++; + break; + case L'<': + DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r))); + r++; + while (*r == L' ') + r++; + cost_max = tre_parse_int(&r, ctx->re_end); + if (cost_max < 0) + cost_max = INT_MAX; + else + cost_max--; + approx = 1; + break; + case CHAR_COMMA: + r++; + done = 1; + break; + default: + if (*r >= L'0' && *r <= L'9') + { +#ifdef TRE_DEBUG + const tre_char_t *sr = r; +#endif /* TRE_DEBUG */ + int cost = tre_parse_int(&r, ctx->re_end); + /* XXX - make sure r is not past end. */ + switch (*r) + { + case L'i': /* Insert cost */ + DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n", + REST(sr))); + r++; + cost_ins = cost; + costs_set = 1; + break; + case L'd': /* Delete cost */ + DPRINT(("tre_parse: del cost: '%.*" STRF "'\n", + REST(sr))); + r++; + cost_del = cost; + costs_set = 1; + break; + case L's': /* Substitute cost */ + DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n", + REST(sr))); + r++; + cost_subst = cost; + costs_set = 1; + break; + default: + return REG_BADBR; + } + } + else + { + done = 1; + break; + } + } + } + } while (start != r); + + /* Missing }. */ + if (r >= ctx->re_end) + return REG_EBRACE; + + /* Empty contents of {}. */ + if (r == ctx->re) + return REG_BADBR; + + /* Parse the ending '}' or '\}'.*/ + if (ctx->cflags & REG_EXTENDED) + { + if (r >= ctx->re_end || *r != CHAR_RBRACE) + return REG_BADBR; + r++; + } + else + { + if (r + 1 >= ctx->re_end + || *r != CHAR_BACKSLASH + || *(r + 1) != CHAR_RBRACE) + return REG_BADBR; + r += 2; + } + + + /* Parse trailing '?' marking minimal repetition. 
*/ + if (r < ctx->re_end) + { + if (*r == CHAR_QUESTIONMARK) + { + minimal = !(ctx->cflags & REG_UNGREEDY); + r++; + } + else if (*r == CHAR_STAR || *r == CHAR_PLUS) + { + /* These are reserved for future extensions. */ + return REG_BADRPT; + } + } + + /* Create the AST node(s). */ + if (min == 0 && max == 0) + { + *result = tre_ast_new_literal(ctx->mem, EMPTY, -1); + if (*result == NULL) + return REG_ESPACE; + } + else + { + if (min < 0 && max < 0) + /* Only approximate parameters set, no repetitions. */ + min = max = 1; + + *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); + if (!*result) + return REG_ESPACE; + + /* If approximate matching parameters are set, add them to the + iteration node. */ + if (approx || costs_set || counts_set) + { + int *params; + tre_iteration_t *iter = (*result)->obj; + + if (costs_set || counts_set) + { + if (limit_ins == TRE_PARAM_UNSET) + { + if (cost_ins == TRE_PARAM_UNSET) + limit_ins = 0; + else + limit_ins = INT_MAX; + } + + if (limit_del == TRE_PARAM_UNSET) + { + if (cost_del == TRE_PARAM_UNSET) + limit_del = 0; + else + limit_del = INT_MAX; + } + + if (limit_subst == TRE_PARAM_UNSET) + { + if (cost_subst == TRE_PARAM_UNSET) + limit_subst = 0; + else + limit_subst = INT_MAX; + } + } + + if (cost_max == TRE_PARAM_UNSET) + cost_max = INT_MAX; + if (limit_err == TRE_PARAM_UNSET) + limit_err = INT_MAX; + + ctx->have_approx = 1; + params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); + if (!params) + return REG_ESPACE; + for (i = 0; i < TRE_PARAM_LAST; i++) + params[i] = TRE_PARAM_UNSET; + params[TRE_PARAM_COST_INS] = cost_ins; + params[TRE_PARAM_COST_DEL] = cost_del; + params[TRE_PARAM_COST_SUBST] = cost_subst; + params[TRE_PARAM_COST_MAX] = cost_max; + params[TRE_PARAM_MAX_INS] = limit_ins; + params[TRE_PARAM_MAX_DEL] = limit_del; + params[TRE_PARAM_MAX_SUBST] = limit_subst; + params[TRE_PARAM_MAX_ERR] = limit_err; + iter->params = params; + } + } + + DPRINT(("tre_parse_bound: min %d, max %d, costs 
[%d,%d,%d, total %d], " + "limits [%d,%d,%d, total %d]\n", + min, max, cost_ins, cost_del, cost_subst, cost_max, + limit_ins, limit_del, limit_subst, limit_err)); + + + ctx->re = r; + return REG_OK; +} + +typedef enum { + PARSE_RE = 0, + PARSE_ATOM, + PARSE_MARK_FOR_SUBMATCH, + PARSE_BRANCH, + PARSE_PIECE, + PARSE_CATENATION, + PARSE_POST_CATENATION, + PARSE_UNION, + PARSE_POST_UNION, + PARSE_POSTFIX, + PARSE_RESTORE_CFLAGS +} tre_parse_re_stack_symbol_t; + + +reg_errcode_t +tre_parse(tre_parse_ctx_t *ctx) +{ + tre_ast_node_t *result = NULL; + tre_parse_re_stack_symbol_t symbol; + reg_errcode_t status = REG_OK; + tre_stack_t *stack = ctx->stack; + size_t bottom = tre_stack_num_items(stack); + int depth = 0; + int temporary_cflags = 0; + + DPRINT(("tre_parse: parsing '%.*" STRF "', len = %zu\n", + (int)ctx->len, ctx->re, ctx->len)); + + if (!ctx->nofirstsub) + { + STACK_PUSH(stack, int, ctx->submatch_id); + STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); + ctx->submatch_id++; + } + STACK_PUSH(stack, int, PARSE_RE); + ctx->re_start = ctx->re; + ctx->re_end = ctx->re + ctx->len; + + + /* The following is basically just a recursive descent parser. I use + an explicit stack instead of recursive functions mostly because of + two reasons: compatibility with systems which have an overflowable + call stack, and efficiency (both in lines of code and speed). */ + while (status == REG_OK && tre_stack_num_items(stack) > bottom) + { + symbol = tre_stack_pop_int(stack); + switch (symbol) + { + case PARSE_RE: + /* Parse a full regexp. A regexp is one or more branches, + separated by the union operator `|'. */ +#ifdef REG_LITERAL + if (!(ctx->cflags & REG_LITERAL) + && ctx->cflags & REG_EXTENDED) +#endif /* REG_LITERAL */ + STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, int, PARSE_BRANCH); + break; + + case PARSE_BRANCH: + /* Parse a branch. A branch is one or more pieces, concatenated. + A piece is an atom possibly followed by a postfix operator. 
*/ + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, int, PARSE_PIECE); + break; + + case PARSE_PIECE: + /* Parse a piece. A piece is an atom possibly followed by one + or more postfix operators. */ +#ifdef REG_LITERAL + if (!(ctx->cflags & REG_LITERAL)) +#endif /* REG_LITERAL */ + STACK_PUSHX(stack, int, PARSE_POSTFIX); + STACK_PUSHX(stack, int, PARSE_ATOM); + break; + + case PARSE_CATENATION: + /* If the expression has not ended, parse another piece. */ + { + tre_char_t c; + if (ctx->re >= ctx->re_end) + break; + c = *ctx->re; +#ifdef REG_LITERAL + if (!(ctx->cflags & REG_LITERAL)) + { +#endif /* REG_LITERAL */ + if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) + break; + if ((ctx->cflags & REG_EXTENDED + && c == CHAR_RPAREN && depth > 0) + || (!(ctx->cflags & REG_EXTENDED) + && (c == CHAR_BACKSLASH + && *(ctx->re + 1) == CHAR_RPAREN))) + { + if (!(ctx->cflags & REG_EXTENDED) && depth == 0) + status = REG_EPAREN; + DPRINT(("tre_parse: group end: '%.*" STRF "'\n", + REST(ctx->re))); + depth--; + if (!(ctx->cflags & REG_EXTENDED)) + ctx->re += 2; + break; + } +#ifdef REG_LITERAL + } +#endif /* REG_LITERAL */ + +#ifdef REG_RIGHT_ASSOC + if (ctx->cflags & REG_RIGHT_ASSOC) + { + /* Right associative concatenation. */ + STACK_PUSHX(stack, voidptr, result); + STACK_PUSHX(stack, int, PARSE_POST_CATENATION); + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, int, PARSE_PIECE); + } + else +#endif /* REG_RIGHT_ASSOC */ + { + /* Default case, left associative concatenation. 
*/ + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, voidptr, result); + STACK_PUSHX(stack, int, PARSE_POST_CATENATION); + STACK_PUSHX(stack, int, PARSE_PIECE); + } + break; + } + + case PARSE_POST_CATENATION: + { + tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); + tre_ast_node_t *tmp_node; + tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); + if (!tmp_node) + return REG_ESPACE; + result = tmp_node; + break; + } + + case PARSE_UNION: + if (ctx->re >= ctx->re_end) + break; +#ifdef REG_LITERAL + if (ctx->cflags & REG_LITERAL) + break; +#endif /* REG_LITERAL */ + switch (*ctx->re) + { + case CHAR_PIPE: + DPRINT(("tre_parse: union: '%.*" STRF "'\n", + REST(ctx->re))); + STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, voidptr, result); + STACK_PUSHX(stack, int, PARSE_POST_UNION); + STACK_PUSHX(stack, int, PARSE_BRANCH); + ctx->re++; + break; + + case CHAR_RPAREN: + ctx->re++; + break; + + default: + break; + } + break; + + case PARSE_POST_UNION: + { + tre_ast_node_t *tmp_node; + tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); + tmp_node = tre_ast_new_union(ctx->mem, tree, result); + if (!tmp_node) + return REG_ESPACE; + result = tmp_node; + break; + } + + case PARSE_POSTFIX: + /* Parse postfix operators. */ + if (ctx->re >= ctx->re_end) + break; +#ifdef REG_LITERAL + if (ctx->cflags & REG_LITERAL) + break; +#endif /* REG_LITERAL */ + switch (*ctx->re) + { + case CHAR_PLUS: + case CHAR_QUESTIONMARK: + if (!(ctx->cflags & REG_EXTENDED)) + break; + /*FALLTHROUGH*/ + case CHAR_STAR: + { + tre_ast_node_t *tmp_node; + int minimal = (ctx->cflags & REG_UNGREEDY) ? 
1 : 0; + int rep_min = 0; + int rep_max = -1; +#ifdef TRE_DEBUG + const tre_char_t *tmp_re; +#endif + + if (*ctx->re == CHAR_PLUS) + rep_min = 1; + if (*ctx->re == CHAR_QUESTIONMARK) + rep_max = 1; +#ifdef TRE_DEBUG + tmp_re = ctx->re; +#endif + + if (ctx->re + 1 < ctx->re_end) + { + if (*(ctx->re + 1) == CHAR_QUESTIONMARK) + { + minimal = !(ctx->cflags & REG_UNGREEDY); + ctx->re++; + } + else if (*(ctx->re + 1) == CHAR_STAR + || *(ctx->re + 1) == CHAR_PLUS) + { + /* These are reserved for future extensions. */ + return REG_BADRPT; + } + } + + DPRINT(("tre_parse: %s star: '%.*" STRF "'\n", + minimal ? " minimal" : "greedy", REST(tmp_re))); + ctx->re++; + tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, + minimal); + if (tmp_node == NULL) + return REG_ESPACE; + result = tmp_node; + STACK_PUSHX(stack, int, PARSE_POSTFIX); + } + break; + + case CHAR_BACKSLASH: + /* "\{" is special without REG_EXTENDED */ + if (!(ctx->cflags & REG_EXTENDED) + && ctx->re + 1 < ctx->re_end + && *(ctx->re + 1) == CHAR_LBRACE) + { + ctx->re++; + goto parse_brace; + } + else + break; + + case CHAR_LBRACE: + /* "{" is literal without REG_EXTENDED */ + if (!(ctx->cflags & REG_EXTENDED)) + break; + + parse_brace: + DPRINT(("tre_parse: bound: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->re++; + + status = tre_parse_bound(ctx, &result); + if (status != REG_OK) + return status; + STACK_PUSHX(stack, int, PARSE_POSTFIX); + break; + } + break; + + case PARSE_ATOM: + /* Parse an atom. An atom is a regular expression enclosed in `()', + an empty set of `()', a bracket expression, `.', `^', `$', + a `\' followed by a character, or a single character. */ + + /* End of regexp? (empty string). */ + if (ctx->re >= ctx->re_end) + goto parse_literal; + +#ifdef REG_LITERAL + if (ctx->cflags & REG_LITERAL) + goto parse_literal; +#endif /* REG_LITERAL */ + + switch (*ctx->re) + { + case CHAR_LPAREN: /* parenthesized subexpression */ + + /* Handle "(?...)" extensions. 
They work in a way similar + to Perls corresponding extensions. */ + if (ctx->cflags & REG_EXTENDED + && ctx->re + 1 < ctx->re_end + && *(ctx->re + 1) == CHAR_QUESTIONMARK) + { + int new_cflags = ctx->cflags; + int bit = 1; + DPRINT(("tre_parse: extension: '%.*" STRF "\n", + REST(ctx->re))); + ctx->re += 2; + while (/*CONSTCOND*/(void)1,1) + { + if (ctx->re >= ctx->re_end) + return REG_BADPAT; + if (*ctx->re == L'i') + { + DPRINT(("tre_parse: icase: '%.*" STRF "\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_ICASE; + else + new_cflags &= ~REG_ICASE; + ctx->re++; + } + else if (*ctx->re == L'n') + { + DPRINT(("tre_parse: newline: '%.*" STRF "\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_NEWLINE; + else + new_cflags &= ~REG_NEWLINE; + ctx->re++; + } +#ifdef REG_RIGHT_ASSOC + else if (*ctx->re == L'r') + { + DPRINT(("tre_parse: right assoc: '%.*" STRF "\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_RIGHT_ASSOC; + else + new_cflags &= ~REG_RIGHT_ASSOC; + ctx->re++; + } +#endif /* REG_RIGHT_ASSOC */ +#ifdef REG_UNGREEDY + else if (*ctx->re == L'U') + { + DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_UNGREEDY; + else + new_cflags &= ~REG_UNGREEDY; + ctx->re++; + } +#endif /* REG_UNGREEDY */ + else if (*ctx->re == CHAR_MINUS) + { + DPRINT(("tre_parse: turn off: '%.*" STRF "\n", + REST(ctx->re))); + ctx->re++; + bit = 0; + } + else if (*ctx->re == CHAR_COLON) + { + DPRINT(("tre_parse: no group: '%.*" STRF "\n", + REST(ctx->re))); + ctx->re++; + depth++; + break; + } + else if (*ctx->re == CHAR_HASH) + { + DPRINT(("tre_parse: comment: '%.*" STRF "\n", + REST(ctx->re))); + /* A comment can contain any character except a + right parenthesis */ + while (ctx->re < ctx->re_end + && *ctx->re != CHAR_RPAREN) + ctx->re++; + if (ctx->re < ctx->re_end + && *ctx->re == CHAR_RPAREN) + { + ctx->re++; + break; + } + else + return REG_BADPAT; + } + else if (*ctx->re == CHAR_RPAREN) + { + ctx->re++; + break; + } + 
else + return REG_BADPAT; + } + + /* Turn on the cflags changes for the rest of the + enclosing group. */ + if (new_cflags != ctx->cflags) + ctx->have_inline_cflags = 1; + STACK_PUSHX(stack, int, ctx->cflags); + STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS); + STACK_PUSHX(stack, int, PARSE_RE); + ctx->cflags = new_cflags; + break; + } + + if (ctx->cflags & REG_EXTENDED + || (ctx->re > ctx->re_start + && *(ctx->re - 1) == CHAR_BACKSLASH)) + { + depth++; + if (ctx->re + 2 < ctx->re_end + && *(ctx->re + 1) == CHAR_QUESTIONMARK + && *(ctx->re + 2) == CHAR_COLON) + { + DPRINT(("tre_parse: group begin: '%.*" STRF + "', no submatch\n", REST(ctx->re))); + /* Don't mark for submatching. */ + ctx->re += 3; + STACK_PUSHX(stack, int, PARSE_RE); + } + else + { + DPRINT(("tre_parse: group begin: '%.*" STRF + "', submatch %d\n", REST(ctx->re), + ctx->submatch_id)); + ctx->re++; + /* First parse a whole RE, then mark the resulting tree + for submatching. */ + STACK_PUSHX(stack, int, ctx->submatch_id); + STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); + STACK_PUSHX(stack, int, PARSE_RE); + ctx->submatch_id++; + } + } + else + goto parse_literal; + break; + + case CHAR_RPAREN: /* end of current subexpression */ + if ((ctx->cflags & REG_EXTENDED && depth > 0) + || (!(ctx->cflags & REG_EXTENDED) && ctx->re > ctx->re_start + && *(ctx->re - 1) == CHAR_BACKSLASH)) + { + DPRINT(("tre_parse: empty: '%.*" STRF "'\n", + REST(ctx->re))); + /* We were expecting an atom, but instead the current + subexpression was closed. POSIX leaves the meaning of + this to be implementation-defined. We interpret this as + an empty expression (which matches an empty string). 
*/ + result = tre_ast_new_literal(ctx->mem, EMPTY, -1); + if (result == NULL) + return REG_ESPACE; + if (!(ctx->cflags & REG_EXTENDED)) + ctx->re--; + } + else + goto parse_literal; + break; + + case CHAR_LBRACKET: /* bracket expression */ + DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->re++; + status = tre_parse_bracket(ctx, &result); + if (status != REG_OK) + return status; + break; + + case CHAR_BACKSLASH: + /* If this is "\(" or "\)" chew off the backslash and + try again. */ + if (!(ctx->cflags & REG_EXTENDED) + && ctx->re + 1 < ctx->re_end + && (*(ctx->re + 1) == CHAR_LPAREN + || *(ctx->re + 1) == CHAR_RPAREN)) + { + ctx->re++; + STACK_PUSHX(stack, int, PARSE_ATOM); + break; + } + + /* If a macro is used, parse the expanded macro recursively. */ + { + tre_char_t buf[64]; + tre_expand_macro(ctx->re + 1, ctx->re_end, + buf, elementsof(buf)); + if (buf[0] != 0) + { + tre_parse_ctx_t subctx; + memcpy(&subctx, ctx, sizeof(subctx)); + subctx.re = buf; + subctx.len = tre_strlen(buf); + subctx.nofirstsub = 1; + status = tre_parse(&subctx); + if (status != REG_OK) + return status; + ctx->re += 2; + result = subctx.result; + break; + } + } + + if (ctx->re + 1 >= ctx->re_end) + /* Trailing backslash. 
*/ + return REG_EESCAPE; + +#ifdef REG_LITERAL + if (*(ctx->re + 1) == L'Q') + { + DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->cflags |= REG_LITERAL; + temporary_cflags |= REG_LITERAL; + ctx->re += 2; + STACK_PUSHX(stack, int, PARSE_ATOM); + break; + } +#endif /* REG_LITERAL */ + + DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); + ctx->re++; + switch (*ctx->re) + { + case L'b': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_WB); + ctx->re++; + break; + case L'B': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_WB_NEG); + ctx->re++; + break; + case L'<': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_BOW); + ctx->re++; + break; + case L'>': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_EOW); + ctx->re++; + break; + case L'x': + ctx->re++; + if (ctx->re >= ctx->re_end) + { + result = tre_ast_new_literal(ctx->mem, 0, 0); + if (result == NULL) + return REG_ESPACE; + break; + } + if (ctx->re[0] != CHAR_LBRACE) + { + /* 8 bit hex char. */ + char tmp[3] = {0, 0, 0}; + long val; + DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", + REST(ctx->re - 2))); + + if (ctx->re < ctx->re_end && tre_isxdigit(ctx->re[0])) + { + tmp[0] = (char)ctx->re[0]; + ctx->re++; + } + if (ctx->re < ctx->re_end && tre_isxdigit(ctx->re[0])) + { + tmp[1] = (char)ctx->re[0]; + ctx->re++; + } + val = strtol(tmp, NULL, 16); + result = tre_ast_new_literal(ctx->mem, (int)val, (int)val); + break; + } + else + { + /* Wide char. 
*/ + char tmp[9]; /* max 8 hex digits + terminator */ + long val; + size_t i = 0; + ctx->re++; + while (ctx->re < ctx->re_end) + { + if (ctx->re[0] == CHAR_RBRACE) + break; + if (tre_isxdigit(ctx->re[0]) && i < sizeof(tmp) - 1) + { + tmp[i] = (char)ctx->re[0]; + i++; + ctx->re++; + continue; + } + return REG_EBRACE; + } + if (ctx->re >= ctx->re_end) + return REG_EBRACE; + ctx->re++; + tmp[i] = 0; + val = strtol(tmp, NULL, 16); + result = tre_ast_new_literal(ctx->mem, (int)val, (int)val); + break; + } + /*FALLTHROUGH*/ + + default: + if (tre_isdigit(*ctx->re)) + { + /* Back reference. */ + int val = *ctx->re - L'0'; + DPRINT(("tre_parse: backref: '%.*" STRF "'\n", + REST(ctx->re - 1))); + result = tre_ast_new_literal(ctx->mem, BACKREF, val); + if (result == NULL) + return REG_ESPACE; + ctx->max_backref = MAX(val, ctx->max_backref); + ctx->re++; + } + else + { + /* Escaped character. */ + DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", + REST(ctx->re - 1))); + result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re); + ctx->re++; + } + break; + } + if (result == NULL) + return REG_ESPACE; + break; + + case CHAR_PERIOD: /* the any-symbol */ + DPRINT(("tre_parse: any: '%.*" STRF "'\n", + REST(ctx->re))); + if (ctx->cflags & REG_NEWLINE) + { + tre_ast_node_t *tmp1; + tre_ast_node_t *tmp2; + tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1); + if (!tmp1) + return REG_ESPACE; + tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX); + if (!tmp2) + return REG_ESPACE; + result = tre_ast_new_union(ctx->mem, tmp1, tmp2); + if (!result) + return REG_ESPACE; + } + else + { + result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX); + if (!result) + return REG_ESPACE; + } + ctx->re++; + break; + + case CHAR_CARET: /* beginning of line assertion */ + /* '^' has a special meaning everywhere in EREs, and in the + beginning of the RE and after \( is BREs. 
*/ + if (ctx->cflags & REG_EXTENDED + || (ctx->re - 2 >= ctx->re_start + && *(ctx->re - 2) == CHAR_BACKSLASH + && *(ctx->re - 1) == CHAR_LPAREN) + || ctx->re == ctx->re_start) + { + DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", + REST(ctx->re))); + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_BOL); + if (result == NULL) + return REG_ESPACE; + ctx->re++; + } + else + goto parse_literal; + break; + + case CHAR_DOLLAR: /* end of line assertion. */ + /* '$' is special everywhere in EREs, and in the end of the + string and before \) is BREs. */ + if (ctx->cflags & REG_EXTENDED + || (ctx->re + 2 < ctx->re_end + && *(ctx->re + 1) == CHAR_BACKSLASH + && *(ctx->re + 2) == CHAR_RPAREN) + || ctx->re + 1 == ctx->re_end) + { + DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", + REST(ctx->re))); + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_EOL); + if (result == NULL) + return REG_ESPACE; + ctx->re++; + } + else + goto parse_literal; + break; + + default: + parse_literal: + + if (temporary_cflags && ctx->re + 1 < ctx->re_end + && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') + { + DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->cflags &= ~temporary_cflags; + temporary_cflags = 0; + ctx->re += 2; + STACK_PUSHX(stack, int, PARSE_PIECE); + break; + } + + + /* We are expecting an atom. If the subexpression (or the whole + regexp) ends here, we interpret it as an empty expression + (which matches an empty string). */ + if ( +#ifdef REG_LITERAL + !(ctx->cflags & REG_LITERAL) && +#endif /* REG_LITERAL */ + (ctx->re >= ctx->re_end + || *ctx->re == CHAR_STAR + || (ctx->cflags & REG_EXTENDED + && (*ctx->re == CHAR_PIPE + || *ctx->re == CHAR_LBRACE + || *ctx->re == CHAR_PLUS + || *ctx->re == CHAR_QUESTIONMARK)) + /* Test for "\)" in BRE mode. 
*/ + || (!(ctx->cflags & REG_EXTENDED) + && ctx->re + 1 < ctx->re_end + && *ctx->re == CHAR_BACKSLASH + && *(ctx->re + 1) == CHAR_LBRACE))) + { + DPRINT(("tre_parse: empty: '%.*" STRF "'\n", + REST(ctx->re))); + result = tre_ast_new_literal(ctx->mem, EMPTY, -1); + if (!result) + return REG_ESPACE; + break; + } + + DPRINT(("tre_parse: literal: '%.*" STRF "'\n", + REST(ctx->re))); + /* Note that we can't use an tre_isalpha() test here, since there + may be characters which are alphabetic but neither upper or + lower case. */ + if (ctx->cflags & REG_ICASE + && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) + { + tre_ast_node_t *tmp1; + tre_ast_node_t *tmp2; + + /* XXX - Can there be more than one opposite-case + counterpoints for some character in some locale? Or + more than two characters which all should be regarded + the same character if case is ignored? If yes, there + does not seem to be a portable way to detect it. I guess + that at least for multi-character collating elements there + could be several opposite-case counterpoints, but they + cannot be supported portably anyway. 
*/ + tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), + tre_toupper(*ctx->re)); + if (!tmp1) + return REG_ESPACE; + tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), + tre_tolower(*ctx->re)); + if (!tmp2) + return REG_ESPACE; + result = tre_ast_new_union(ctx->mem, tmp1, tmp2); + if (!result) + return REG_ESPACE; + } + else + { + result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re); + if (!result) + return REG_ESPACE; + } + ctx->re++; + break; + } + break; + + case PARSE_MARK_FOR_SUBMATCH: + { + int submatch_id = tre_stack_pop_int(stack); + + assert(result); + if (result->submatch_id >= 0) + { + tre_ast_node_t *n, *tmp_node; + n = tre_ast_new_literal(ctx->mem, EMPTY, -1); + if (n == NULL) + return REG_ESPACE; + tmp_node = tre_ast_new_catenation(ctx->mem, n, result); + if (tmp_node == NULL) + return REG_ESPACE; + tmp_node->num_submatches = result->num_submatches; + result = tmp_node; + } + result->submatch_id = submatch_id; + result->num_submatches++; + break; + } + + case PARSE_RESTORE_CFLAGS: + ctx->cflags = tre_stack_pop_int(stack); + break; + + default: + assert(0); + break; + } + } + + if (status != REG_OK) + return status; + + /* Check for missing closing parentheses. */ + if (depth > 0) + return REG_EPAREN; + + ctx->result = result; + return REG_OK; +} + +/* EOF */ diff --git a/deps/tre/lib/tre-parse.h b/deps/tre/lib/tre-parse.h new file mode 100644 index 000000000..39260ea7f --- /dev/null +++ b/deps/tre/lib/tre-parse.h @@ -0,0 +1,52 @@ +/* + tre-parse.c - Regexp parser definitions + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#ifndef TRE_PARSE_H +#define TRE_PARSE_H 1 + +/* Parse context. */ +typedef struct { + /* Memory allocator. The AST is allocated using this. */ + tre_mem_t mem; + /* Stack used for keeping track of regexp syntax. */ + tre_stack_t *stack; + /* The parse result. */ + tre_ast_node_t *result; + /* The regexp to parse and its length. 
*/ + const tre_char_t *re; + /* The first character of the entire regexp. */ + const tre_char_t *re_start; + /* The first character after the end of the regexp. */ + const tre_char_t *re_end; + size_t len; + /* Current submatch ID. */ + int submatch_id; + /* The highest back reference or -1 if none seen so far. */ + int max_backref; + /* This flag is set if the regexp uses approximate matching. */ + int have_approx; + /* This flag is set if the regexp changes cflags inline using (?...) */ + int have_inline_cflags; + /* Compilation flags. */ + int cflags; + /* If this flag is set the top-level submatch is not captured. */ + int nofirstsub; + /* The currently set approximate matching parameters. */ + int params[TRE_PARAM_LAST]; + /* the MB_CUR_MAX in use */ + int mb_cur_max; +} tre_parse_ctx_t; + +/* Parses a wide character regexp pattern into a syntax tree. This parser + handles both syntaxes (BRE and ERE), including the TRE extensions. */ +reg_errcode_t +tre_parse(tre_parse_ctx_t *ctx); + +#endif /* TRE_PARSE_H */ + +/* EOF */ diff --git a/deps/tre/lib/tre-stack.c b/deps/tre/lib/tre-stack.c new file mode 100644 index 000000000..199aaf1b7 --- /dev/null +++ b/deps/tre/lib/tre-stack.c @@ -0,0 +1,123 @@ +/* + tre-stack.c - Simple stack implementation + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. 
+ +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#include "tre-internal.h" +#include "tre-stack.h" +#include "xmalloc.h" + +union tre_stack_item { + void *voidptr_value; + int int_value; +}; + +struct tre_stack_rec { + size_t size; + size_t max_size; + size_t ptr; + union tre_stack_item *stack; +}; + + +tre_stack_t * +tre_stack_new(size_t size, size_t max_size) +{ + tre_stack_t *s; + + s = xmalloc(sizeof(*s)); + if (s != NULL) + { + s->stack = xmalloc(sizeof(*s->stack) * size); + if (s->stack == NULL) + { + xfree(s); + return NULL; + } + s->size = size; + s->max_size = max_size; + s->ptr = 0; + } + return s; +} + +void +tre_stack_destroy(tre_stack_t *s) +{ + xfree(s->stack); + xfree(s); +} + +size_t +tre_stack_num_items(tre_stack_t *s) +{ + return s->ptr; +} + +static reg_errcode_t +tre_stack_push(tre_stack_t *s, union tre_stack_item value) +{ + if (s->ptr < s->size) + { + s->stack[s->ptr] = value; + s->ptr++; + } + else + { + if (s->size >= s->max_size) + { + DPRINT(("tre_stack_push: stack full\n")); + return REG_ESPACE; + } + else + { + union tre_stack_item *new_buffer; + size_t new_size; + DPRINT(("tre_stack_push: trying to realloc more space\n")); + new_size = s->size + s->size; + if (new_size > s->max_size) + new_size = s->max_size; + new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size); + if (new_buffer == NULL) + { + DPRINT(("tre_stack_push: realloc failed.\n")); + return REG_ESPACE; + } + DPRINT(("tre_stack_push: realloc succeeded.\n")); + assert(new_size > s->size); + s->size = new_size; + s->stack = new_buffer; + tre_stack_push(s, value); + } + } + return REG_OK; +} + +#define define_pushf(typetag, type) \ + declare_pushf(typetag, type) { \ + union tre_stack_item item; \ + item.typetag ## _value = value; \ + return tre_stack_push(s, item); \ +} + +define_pushf(int, int) +define_pushf(voidptr, void *) + +#define define_popf(typetag, type) \ + declare_popf(typetag, type) { \ + return 
s->stack[--s->ptr].typetag ## _value; \ + } + +define_popf(int, int) +define_popf(voidptr, void *) + +/* EOF */ diff --git a/deps/tre/lib/tre-stack.h b/deps/tre/lib/tre-stack.h new file mode 100644 index 000000000..1408f322a --- /dev/null +++ b/deps/tre/lib/tre-stack.h @@ -0,0 +1,76 @@ +/* + tre-stack.h: Stack definitions + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + + +#ifndef TRE_STACK_H +#define TRE_STACK_H 1 + +#include "../local_includes/tre.h" + +typedef struct tre_stack_rec tre_stack_t; + +/* Creates a new stack object with initial size `size' and maximum size + `max_size'. Pushing an additional item onto a full stack will resize + the stack to double its capacity until the maximum is reached. Returns + the stack object or NULL if out of memory. */ +tre_stack_t * +tre_stack_new(size_t size, size_t max_size); + +/* Frees the stack object. */ +void +tre_stack_destroy(tre_stack_t *s); + +/* Returns the current number of items on the stack. */ +size_t +tre_stack_num_items(tre_stack_t *s); + +/* Each tre_stack_push_*(tre_stack_t *s, value) function pushes + `value' on top of stack `s'. Returns REG_ESPACE if out of memory. + This tries to realloc() more space before failing if maximum size + has not yet been reached. Returns REG_OK if successful. */ +#define declare_pushf(typetag, type) \ + reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value) + +declare_pushf(voidptr, void *); +declare_pushf(int, int); + +/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost + element off of stack `s' and returns it. The stack must not be + empty. */ +#define declare_popf(typetag, type) \ + type tre_stack_pop_ ## typetag(tre_stack_t *s) + +declare_popf(voidptr, void *); +declare_popf(int, int); + +/* Just to save some typing. 
*/ +#define STACK_PUSH(s, typetag, value) \ + do \ + { \ + status = tre_stack_push_ ## typetag(s, value); \ + } \ + while (/*CONSTCOND*/(void)0,0) + +#define STACK_PUSHX(s, typetag, value) \ + { \ + status = tre_stack_push_ ## typetag(s, value); \ + if (status != REG_OK) \ + break; \ + } + +#define STACK_PUSHR(s, typetag, value) \ + { \ + reg_errcode_t _status; \ + _status = tre_stack_push_ ## typetag(s, value); \ + if (_status != REG_OK) \ + return _status; \ + } + +#endif /* TRE_STACK_H */ + +/* EOF */ diff --git a/deps/tre/lib/xmalloc.c b/deps/tre/lib/xmalloc.c new file mode 100644 index 000000000..637235d8d --- /dev/null +++ b/deps/tre/lib/xmalloc.c @@ -0,0 +1,362 @@ +/* + xmalloc.c - Simple malloc debugging library implementation + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* + TODO: + - red zones + - group dumps by source location +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#define XMALLOC_INTERNAL 1 +#include "xmalloc.h" + + +/* + Internal stuff. 
+*/ + +typedef struct hashTableItemRec { + void *ptr; + size_t bytes; + const char *file; + int line; + const char *func; + struct hashTableItemRec *next; +} hashTableItem; + +typedef struct { + hashTableItem **table; +} hashTable; + +static int xmalloc_peak; +int xmalloc_current; +static int xmalloc_peak_blocks; +int xmalloc_current_blocks; +static int xmalloc_fail_after; + +#define TABLE_BITS 8 +#define TABLE_MASK ((1 << TABLE_BITS) - 1) +#define TABLE_SIZE (1 << TABLE_BITS) + +static hashTable * +hash_table_new(void) +{ + hashTable *tbl; + + tbl = malloc(sizeof(*tbl)); + + if (tbl != NULL) + { + tbl->table = calloc(TABLE_SIZE, sizeof(*tbl->table)); + + if (tbl->table == NULL) + { + free(tbl); + return NULL; + } + } + + return tbl; +} + +static unsigned int +hash_void_ptr(void *ptr) +{ + unsigned int hash; + unsigned int i; + + /* I took this hash function just off the top of my head, I have + no idea whether it is bad or very bad. */ + hash = 0; + for (i = 0; i < sizeof(ptr) * 8 / TABLE_BITS; i++) + { + hash ^= (uintptr_t)ptr >> i * 8; + hash += i * 17; + hash &= TABLE_MASK; + } + return hash; +} + +static void +hash_table_add(hashTable *tbl, void *ptr, size_t bytes, + const char *file, int line, const char *func) +{ + unsigned int i; + hashTableItem *item, *new; + + i = hash_void_ptr(ptr); + + item = tbl->table[i]; + if (item != NULL) + while (item->next != NULL) + item = item->next; + + new = malloc(sizeof(*new)); + assert(new != NULL); + new->ptr = ptr; + new->bytes = bytes; + new->file = file; + new->line = line; + new->func = func; + new->next = NULL; + if (item != NULL) + item->next = new; + else + tbl->table[i] = new; + + xmalloc_current += bytes; + if (xmalloc_current > xmalloc_peak) + xmalloc_peak = xmalloc_current; + xmalloc_current_blocks++; + if (xmalloc_current_blocks > xmalloc_peak_blocks) + xmalloc_peak_blocks = xmalloc_current_blocks; +} + +static void +#if defined(__GNUC__) && __GNUC__ >= 10 +__attribute__((access(none, 2))) +#endif 
+hash_table_del(hashTable *tbl, void *ptr) +{ + int i; + hashTableItem *item, *prev; + + i = hash_void_ptr(ptr); + + item = tbl->table[i]; + if (item == NULL) + { + printf("xfree: invalid ptr %p\n", ptr); + abort(); + } + prev = NULL; /* predecessor of item in the bucket chain, NULL while item is the head */ + while (item != NULL && item->ptr != ptr) /* stop at the end of the chain instead of dereferencing NULL */ + { + prev = item; + item = item->next; + } + if (item == NULL) /* chain exhausted: ptr is not tracked in this bucket */ + { + printf("xfree: invalid ptr %p\n", ptr); + abort(); + } + + xmalloc_current -= item->bytes; + xmalloc_current_blocks--; + + if (prev != NULL) + { + prev->next = item->next; + free(item); + } + else + { + tbl->table[i] = item->next; + free(item); + } +} + +static hashTable *xmalloc_table = NULL; + +static void +xmalloc_init(void) +{ + if (xmalloc_table == NULL) + { + xmalloc_table = hash_table_new(); + xmalloc_peak = 0; + xmalloc_peak_blocks = 0; + xmalloc_current = 0; + xmalloc_current_blocks = 0; + xmalloc_fail_after = -1; + } + assert(xmalloc_table != NULL); + assert(xmalloc_table->table != NULL); +} + + + +/* + Public API. +*/ + +void +xmalloc_configure(int fail_after) +{ + xmalloc_init(); + xmalloc_fail_after = fail_after; +} + +int +xmalloc_dump_leaks(void) +{ + unsigned int i; + unsigned int num_leaks = 0; + size_t leaked_bytes = 0; + hashTableItem *item; + + xmalloc_init(); + + for (i = 0; i < TABLE_SIZE; i++) + { + item = xmalloc_table->table[i]; + while (item != NULL) + { + printf("%s:%d: %s: %zu bytes at %p not freed\n", + item->file, item->line, item->func, item->bytes, item->ptr); + num_leaks++; + leaked_bytes += item->bytes; + item = item->next; + } + } + if (num_leaks == 0) + printf("No memory leaks.\n"); + else + printf("%u unfreed memory chuncks, total %zu unfreed bytes.\n", + num_leaks, leaked_bytes); + printf("Peak memory consumption %d bytes (%.1f kB, %.1f MB) in %d blocks ", + xmalloc_peak, (double)xmalloc_peak / 1024, + (double)xmalloc_peak / (1024*1024), xmalloc_peak_blocks); + printf("(average "); + if (xmalloc_peak_blocks) + printf("%d", ((xmalloc_peak + xmalloc_peak_blocks / 2) + / xmalloc_peak_blocks)); + else
+ printf("N/A"); + printf(" bytes per block).\n"); + + return num_leaks; +} + +void * +xmalloc_impl(size_t size, const char *file, int line, const char *func) +{ + void *ptr; + + xmalloc_init(); + assert(size > 0); + + if (xmalloc_fail_after == 0) + { + xmalloc_fail_after = -2; +#if 0 + printf("xmalloc: forced failure %s:%d: %s\n", file, line, func); +#endif + return NULL; + } + else if (xmalloc_fail_after == -2) + { + printf("xmalloc: called after failure from %s:%d: %s\n", + file, line, func); + assert(0); + } + else if (xmalloc_fail_after > 0) + xmalloc_fail_after--; + + ptr = malloc(size); + if (ptr != NULL) + hash_table_add(xmalloc_table, ptr, size, file, line, func); /* bytes parameter is size_t: pass it without a narrowing cast */ + return ptr; +} + +void * +xcalloc_impl(size_t nmemb, size_t size, const char *file, int line, + const char *func) +{ + void *ptr; + + xmalloc_init(); + assert(size > 0); + + if (xmalloc_fail_after == 0) + { + xmalloc_fail_after = -2; +#if 0 + printf("xcalloc: forced failure %s:%d: %s\n", file, line, func); +#endif + return NULL; + } + else if (xmalloc_fail_after == -2) + { + printf("xcalloc: called after failure from %s:%d: %s\n", + file, line, func); + assert(0); + } + else if (xmalloc_fail_after > 0) + xmalloc_fail_after--; + + ptr = calloc(nmemb, size); + if (ptr != NULL) + hash_table_add(xmalloc_table, ptr, nmemb * size, file, line, func); /* calloc succeeded, so nmemb * size did not overflow size_t */ + return ptr; +} + +void +xfree_impl(void *ptr, const char *file, int line, const char *func) +{ + /*LINTED*/(void)&file; + /*LINTED*/(void)&line; + /*LINTED*/(void)&func; + xmalloc_init(); + + if (ptr != NULL) + hash_table_del(xmalloc_table, ptr); + free(ptr); +} + +void * +xrealloc_impl(void *ptr, size_t new_size, const char *file, int line, + const char *func) +{ + void *new_ptr; + + xmalloc_init(); + assert(ptr != NULL); + assert(new_size > 0); + + if (xmalloc_fail_after == 0) + { + xmalloc_fail_after = -2; + return NULL; + } + else if (xmalloc_fail_after == -2) + { + printf("xrealloc: called after failure from %s:%d: %s\n", + file, line,
func); + assert(0); + } + else if (xmalloc_fail_after > 0) + xmalloc_fail_after--; + + new_ptr = realloc(ptr, new_size); + if (new_ptr != NULL) /* re-register even when resized in place, so the tracked byte count follows new_size */ + { +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuse-after-free" +#endif + hash_table_del(xmalloc_table, ptr); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + hash_table_add(xmalloc_table, new_ptr, new_size, file, line, func); /* bytes parameter is size_t: no narrowing cast */ + } + return new_ptr; +} + + + +/* EOF */ diff --git a/deps/tre/lib/xmalloc.h b/deps/tre/lib/xmalloc.h new file mode 100644 index 000000000..ce310af52 --- /dev/null +++ b/deps/tre/lib/xmalloc.h @@ -0,0 +1,77 @@ +/* + xmalloc.h - Simple malloc debugging library API + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#ifndef _XMALLOC_H +#define _XMALLOC_H 1 + +void *xmalloc_impl(size_t size, const char *file, int line, const char *func); +void *xcalloc_impl(size_t nmemb, size_t size, const char *file, int line, + const char *func); +void xfree_impl(void *ptr, const char *file, int line, const char *func); +void *xrealloc_impl(void *ptr, size_t new_size, const char *file, int line, + const char *func); +int xmalloc_dump_leaks(void); +void xmalloc_configure(int fail_after); + + +#ifndef XMALLOC_INTERNAL +#ifdef MALLOC_DEBUGGING + +/* Version 2.4 and later of GCC define a magical variable `__PRETTY_FUNCTION__' + which contains the name of the function currently being defined. +# define __XMALLOC_FUNCTION __PRETTY_FUNCTION__ + This is broken in G++ before version 2.6. + C9x has a similar variable called __func__, but prefer the GCC one since + it demangles C++ function names. */ +# ifdef __GNUC__ +# if __GNUC__ > 2 || (__GNUC__ == 2 \ + && __GNUC_MINOR__ >= (defined __cplusplus ?
6 : 4)) +# define __XMALLOC_FUNCTION __PRETTY_FUNCTION__ +# else +# define __XMALLOC_FUNCTION ((const char *) 0) +# endif +# else +# if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L +# define __XMALLOC_FUNCTION __func__ +# else +# define __XMALLOC_FUNCTION ((const char *) 0) +# endif +# endif + +#define xmalloc(size) xmalloc_impl(size, __FILE__, __LINE__, \ + __XMALLOC_FUNCTION) +#define xcalloc(nmemb, size) xcalloc_impl(nmemb, size, __FILE__, __LINE__, \ + __XMALLOC_FUNCTION) +#define xfree(ptr) xfree_impl(ptr, __FILE__, __LINE__, __XMALLOC_FUNCTION) +#define xrealloc(ptr, new_size) xrealloc_impl(ptr, new_size, __FILE__, \ + __LINE__, __XMALLOC_FUNCTION) +#undef malloc +#undef calloc +#undef free +#undef realloc + +#define malloc USE_XMALLOC_INSTEAD_OF_MALLOC +#define calloc USE_XCALLOC_INSTEAD_OF_CALLOC +#define free USE_XFREE_INSTEAD_OF_FREE +#define realloc USE_XREALLOC_INSTEAD_OF_REALLOC + +#else /* !MALLOC_DEBUGGING */ + +#include + +#define xmalloc(size) malloc(size) +#define xcalloc(nmemb, size) calloc(nmemb, size) +#define xfree(ptr) free(ptr) +#define xrealloc(ptr, new_size) realloc(ptr, new_size) + +#endif /* !MALLOC_DEBUGGING */ +#endif /* !XMALLOC_INTERNAL */ + +#endif /* _XMALLOC_H */ + +/* EOF */ diff --git a/deps/tre/local_includes/regex.h b/deps/tre/local_includes/regex.h new file mode 100644 index 000000000..daa15a741 --- /dev/null +++ b/deps/tre/local_includes/regex.h @@ -0,0 +1,48 @@ +/* + regex.h - TRE legacy API + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + + This header is for source level compatibility with old code using + the header which defined the TRE API functions without + a prefix. New code should include instead. + +*/ + +#ifndef TRE_REGEX_H +#define TRE_REGEX_H 1 + +#ifdef USE_LOCAL_TRE_H +/* Use the header(s) from the TRE package that this file is part of.
+ (Yes, this file is in local_include too, but the explict path + means there is no way to get a system tre.h by accident.) */ +#include "../local_includes/tre.h" +#else +/* Use the header(s) from an installed version of the TRE package + (so that this application matches the installed libtre), + not the one(s) in the local_includes directory. */ +#include +#endif + +#ifndef TRE_USE_SYSTEM_REGEX_H +#define regcomp tre_regcomp +#define regerror tre_regerror +#define regexec tre_regexec +#define regfree tre_regfree +#endif /* TRE_USE_SYSTEM_REGEX_H */ + +#define regacomp tre_regacomp +#define regaexec tre_regaexec +#define regancomp tre_regancomp +#define reganexec tre_reganexec +#define regawncomp tre_regawncomp +#define regawnexec tre_regawnexec +#define regncomp tre_regncomp +#define regnexec tre_regnexec +#define regwcomp tre_regwcomp +#define regwexec tre_regwexec +#define regwncomp tre_regwncomp +#define regwnexec tre_regwnexec + +#endif /* TRE_REGEX_H */ diff --git a/deps/tre/local_includes/tre-config.h b/deps/tre/local_includes/tre-config.h new file mode 100644 index 000000000..4b73c1289 --- /dev/null +++ b/deps/tre/local_includes/tre-config.h @@ -0,0 +1,14 @@ +/* Minimal TRE configuration for Redis. + * + * We use TRE as a byte-oriented regex matcher for ARGREP. Redis SDS values are + * binary-safe byte strings, so we intentionally keep the dependency build + * simple: no wide-char path, no multibyte locale handling, and no approximate + * matching engine. + */ + +#define HAVE_SYS_TYPES_H 1 + +#define TRE_VERSION "redis-vendored" +#define TRE_VERSION_1 0 +#define TRE_VERSION_2 0 +#define TRE_VERSION_3 0 diff --git a/deps/tre/local_includes/tre.h b/deps/tre/local_includes/tre.h new file mode 100644 index 000000000..675153990 --- /dev/null +++ b/deps/tre/local_includes/tre.h @@ -0,0 +1,344 @@ +/* + tre.h - TRE public API definitions + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. 
+ +*/ + +#ifndef TRE_H +#define TRE_H 1 + +#ifdef USE_LOCAL_TRE_H +/* Make certain to use the header(s) from the TRE package that this + file is part of by giving the full path to the header from this directory. */ +#include "../local_includes/tre-config.h" +#else +/* Use the header in the same directory as this file if there is one. */ +#include "tre-config.h" +#endif + +#ifdef HAVE_SYS_TYPES_H +#include +#endif /* HAVE_SYS_TYPES_H */ + +#ifdef HAVE_LIBUTF8_H +#include +#endif /* HAVE_LIBUTF8_H */ + +#ifdef TRE_USE_SYSTEM_REGEX_H +/* Include the system regex.h to make TRE ABI compatible with the + system regex. */ +#include TRE_SYSTEM_REGEX_H_PATH +#define tre_regcomp regcomp +#define tre_regexec regexec +#define tre_regerror regerror +#define tre_regfree regfree +/* The GNU C regex has a number of refinements to the POSIX standard for the + formal parameter list of the regexec() function, and some of those fail to + compile when using LLVM. The refinements seem to be opt-out rather than + opt-in when using a recent gcc, and they produce a warning when TRE tries + to mimic the API without the refinements. The TRE code still works but + the warnings are distracting, so try to #define a flag to indicate when to + add the refinements to TRE's parameter list too. */ +#ifdef __GNUC__ +/* Try to test something that looks pretty REGEX specific and hope we don't + need a zillion different platform+compiler specific tests to deal with this. */ +#ifdef _REGEX_NELTS +/* Define a TRE specific flag here so that: + 1) there is only one place where code has to be changed if the test above is not adequate, and + 2) the flag can be used in any other parts of the TRE source that might be affected by the + GNUC refinements. + Note that this flag is only defined when all of TRE_USE_SYSTEM_REGEX_H, __GNUC__, and _REGEX_NELTS are defined. 
*/ +#define TRE_USE_GNUC_REGEXEC_FPL 1 +#endif +#endif +#endif /* TRE_USE_SYSTEM_REGEX_H */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef TRE_USE_SYSTEM_REGEX_H + +#ifndef REG_OK +#define REG_OK 0 +#endif /* !REG_OK */ + +#ifndef HAVE_REG_ERRCODE_T +typedef int reg_errcode_t; +#endif /* !HAVE_REG_ERRCODE_T */ + +#if !defined(REG_NOSPEC) && !defined(REG_LITERAL) +#define REG_LITERAL 0x1000 +#endif + +/* Extra tre_regcomp() return error codes. */ +#define REG_BADMAX REG_BADBR + +/* Extra tre_regcomp() flags. */ +#ifndef REG_BASIC +#define REG_BASIC 0 +#endif /* !REG_BASIC */ +#define REG_RIGHT_ASSOC (REG_LITERAL << 1) +#ifdef REG_UNGREEDY +/* We're going to use TRE code, so we need the TRE define (dodge problem in MacOS). */ +#undef REG_UNGREEDY +#endif +#define REG_UNGREEDY (REG_RIGHT_ASSOC << 1) + +#define REG_USEBYTES (REG_UNGREEDY << 1) + +/* Extra tre_regexec() flags. */ +#define REG_APPROX_MATCHER 0x1000 +#ifdef REG_BACKTRACKING_MATCHER +/* We're going to use TRE code, so we need the TRE define (dodge problem in MacOS). */ +#undef REG_BACKTRACKING_MATCHER +#endif +#define REG_BACKTRACKING_MATCHER (REG_APPROX_MATCHER << 1) + +#else /* !TRE_USE_SYSTEM_REGEX_H */ + +/* If the we're not using system regex.h, we need to define the + structs and enums ourselves. */ + +typedef int regoff_t; +typedef struct { + size_t re_nsub; /* Number of parenthesized subexpressions. */ + void *value; /* For internal use only. */ +} regex_t; + +typedef struct { + regoff_t rm_so; + regoff_t rm_eo; +} regmatch_t; + + +typedef enum { + REG_OK = 0, /* No error. */ + /* POSIX tre_regcomp() return error codes. (In the order listed in the + standard.) */ + REG_NOMATCH, /* No match. */ + REG_BADPAT, /* Invalid regexp. */ + REG_ECOLLATE, /* Unknown collating element. */ + REG_ECTYPE, /* Unknown character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. 
*/ + REG_EBRACK, /* "[]" imbalance */ + REG_EPAREN, /* "\(\)" or "()" imbalance */ + REG_EBRACE, /* "\{\}" or "{}" imbalance */ + REG_BADBR, /* Invalid content of {} */ + REG_ERANGE, /* Invalid use of range operator */ + REG_ESPACE, /* Out of memory. */ + REG_BADRPT, /* Invalid use of repetition operators. */ + REG_BADMAX, /* Maximum repetition in {} too large */ +} reg_errcode_t; + +/* POSIX tre_regcomp() flags. */ +#define REG_EXTENDED 1 +#define REG_ICASE (REG_EXTENDED << 1) +#define REG_NEWLINE (REG_ICASE << 1) +#define REG_NOSUB (REG_NEWLINE << 1) + +/* Extra tre_regcomp() flags. */ +#define REG_BASIC 0 +#define REG_LITERAL (REG_NOSUB << 1) +#define REG_RIGHT_ASSOC (REG_LITERAL << 1) +#define REG_UNGREEDY (REG_RIGHT_ASSOC << 1) +#define REG_USEBYTES (REG_UNGREEDY << 1) + +/* POSIX tre_regexec() flags. */ +#define REG_NOTBOL 1 +#define REG_NOTEOL (REG_NOTBOL << 1) + +/* Extra tre_regexec() flags. */ +#define REG_APPROX_MATCHER (REG_NOTEOL << 1) +#define REG_BACKTRACKING_MATCHER (REG_APPROX_MATCHER << 1) + +#endif /* !TRE_USE_SYSTEM_REGEX_H */ + +/* REG_NOSPEC and REG_LITERAL mean the same thing. */ +#if defined(REG_LITERAL) && !defined(REG_NOSPEC) +#define REG_NOSPEC REG_LITERAL +#elif defined(REG_NOSPEC) && !defined(REG_LITERAL) +#define REG_LITERAL REG_NOSPEC +#endif /* defined(REG_NOSPEC) */ + +/* The maximum number of iterations in a bound expression. 
*/ +#undef RE_DUP_MAX +#define RE_DUP_MAX 255 + +/* The POSIX.2 regexp functions */ +extern int +tre_regcomp(regex_t *preg, const char *regex, int cflags); + +#ifdef TRE_USE_GNUC_REGEXEC_FPL +extern int +tre_regexec(const regex_t *preg, const char *string, + size_t nmatch, regmatch_t pmatch[_Restrict_arr_ _REGEX_NELTS (nmatch)], + int eflags); +#else +extern int +tre_regexec(const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags); +#endif + +extern int +tre_regcompb(regex_t *preg, const char *regex, int cflags); + +extern int +tre_regexecb(const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags); + +extern size_t +tre_regerror(int errcode, const regex_t *preg, char *errbuf, + size_t errbuf_size); + +extern void +tre_regfree(regex_t *preg); + +#ifdef TRE_WCHAR +#ifdef HAVE_WCHAR_H +#include +#endif /* HAVE_WCHAR_H */ + +/* Wide character versions (not in POSIX.2). */ +extern int +tre_regwcomp(regex_t *preg, const wchar_t *regex, int cflags); + +extern int +tre_regwexec(const regex_t *preg, const wchar_t *string, + size_t nmatch, regmatch_t pmatch[], int eflags); +#endif /* TRE_WCHAR */ + +/* Versions with a maximum length argument and therefore the capability to + handle null characters in the middle of the strings (not in POSIX.2). 
*/ +extern int +tre_regncomp(regex_t *preg, const char *regex, size_t len, int cflags); + +extern int +tre_regnexec(const regex_t *preg, const char *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags); + +/* regn*b versions take byte literally as 8-bit values */ +extern int +tre_regncompb(regex_t *preg, const char *regex, size_t n, int cflags); + +extern int +tre_regnexecb(const regex_t *preg, const char *str, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags); + +#ifdef TRE_WCHAR +extern int +tre_regwncomp(regex_t *preg, const wchar_t *regex, size_t len, int cflags); + +extern int +tre_regwnexec(const regex_t *preg, const wchar_t *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags); +#endif /* TRE_WCHAR */ + +#ifdef TRE_APPROX + +/* Approximate matching parameter struct. */ +typedef struct { + int cost_ins; /* Default cost of an inserted character. */ + int cost_del; /* Default cost of a deleted character. */ + int cost_subst; /* Default cost of a substituted character. */ + int max_cost; /* Maximum allowed cost of a match. */ + + int max_ins; /* Maximum allowed number of inserts. */ + int max_del; /* Maximum allowed number of deletes. */ + int max_subst; /* Maximum allowed number of substitutes. */ + int max_err; /* Maximum allowed number of errors total. */ +} regaparams_t; + +/* Approximate matching result struct. */ +typedef struct { + size_t nmatch; /* Length of pmatch[] array. */ + regmatch_t *pmatch; /* Submatch data. */ + int cost; /* Cost of the match. */ + int num_ins; /* Number of inserts in the match. */ + int num_del; /* Number of deletes in the match. */ + int num_subst; /* Number of substitutes in the match. */ +} regamatch_t; + + +/* Approximate matching functions. 
*/ +extern int +tre_regaexec(const regex_t *preg, const char *string, + regamatch_t *match, regaparams_t params, int eflags); + +extern int +tre_reganexec(const regex_t *preg, const char *string, size_t len, + regamatch_t *match, regaparams_t params, int eflags); + +extern int +tre_regaexecb(const regex_t *preg, const char *string, + regamatch_t *match, regaparams_t params, int eflags); + +#ifdef TRE_WCHAR +/* Wide character approximate matching. */ +extern int +tre_regawexec(const regex_t *preg, const wchar_t *string, + regamatch_t *match, regaparams_t params, int eflags); + +extern int +tre_regawnexec(const regex_t *preg, const wchar_t *string, size_t len, + regamatch_t *match, regaparams_t params, int eflags); +#endif /* TRE_WCHAR */ + +/* Sets the parameters to default values. */ +extern void +tre_regaparams_default(regaparams_t *params); +#endif /* TRE_APPROX */ + +#ifdef TRE_WCHAR +typedef wchar_t tre_char_t; +#else /* !TRE_WCHAR */ +typedef unsigned char tre_char_t; +#endif /* !TRE_WCHAR */ + +typedef struct { + int (*get_next_char)(tre_char_t *c, unsigned int *pos_add, void *context); + void (*rewind)(size_t pos, void *context); + int (*compare)(size_t pos1, size_t pos2, size_t len, void *context); + void *context; +} tre_str_source; + +extern int +tre_reguexec(const regex_t *preg, const tre_str_source *string, + size_t nmatch, regmatch_t pmatch[], int eflags); + +/* Returns the version string. The returned string is static. */ +extern char * +tre_version(void); + +/* Returns the value for a config parameter. The type to which `result' + must point to depends of the value of `query', see documentation for + more details. */ +extern int +tre_config(int query, void *result); + +enum { + TRE_CONFIG_APPROX, + TRE_CONFIG_WCHAR, + TRE_CONFIG_MULTIBYTE, + TRE_CONFIG_SYSTEM_ABI, + TRE_CONFIG_VERSION +}; + +/* Returns 1 if the compiled pattern has back references, 0 if not. 
*/ +extern int +tre_have_backrefs(const regex_t *preg); + +/* Returns 1 if the compiled pattern uses approximate matching features, + 0 if not. */ +extern int +tre_have_approx(const regex_t *preg); + +#ifdef __cplusplus +} +#endif +#endif /* TRE_H */ + +/* EOF */ diff --git a/deps/tre/tests/retest.c b/deps/tre/tests/retest.c new file mode 100644 index 000000000..c486a819c --- /dev/null +++ b/deps/tre/tests/retest.c @@ -0,0 +1,1871 @@ +/* + retest.c - TRE regression test program + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* + This is just a simple test application containing various hands-written + tests for regression testing TRE. I've tried to surround TRE specific + tests inside ifdefs, so this can be used to test any POSIX compatible + regexp implementation. +*/ + +/* + 2023/06 - Compilers now sometimes require the input string constants to be + properly encoded, but how they decide on which encoding (if any) + is poorly documented and different for different platforms. + The non-ASCII encoded strings are now guarded by #ifdefs with one + of the following values. Define/undef whichever one(s) you need. + #define SRC_IN_ISO_8859_1 + #define SRC_IN_UTF_8 + #define SRC_IN_EUC_JP + */ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +/* look for getopt in order to use a -o option for output. 
*/
+#if defined(HAVE_UNISTD_H)
+#include <unistd.h>
+#elif defined(HAVE_GETOPT_H)
+#include <getopt.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif /* HAVE_MALLOC_H */
+
+#ifdef TRE_VERSION
+#define HAVE_REGNEXEC 1
+#define HAVE_REGNCOMP 1
+#include "xmalloc.h"
+#else /* !TRE_VERSION */
+#define xmalloc malloc
+#define xfree free
+#endif /* !TRE_VERSION */
+
+#include "tre-internal.h"
+
+#ifdef WRETEST
+#include <wchar.h>
+#define CHAR_T wchar_t
+#define L(x) (L ## x)
+
+#define MAXSTRSIZE 8192
+static wchar_t wstr[MAXSTRSIZE];
+static wchar_t wregex[MAXSTRSIZE];
+static int woffs[MAXSTRSIZE];
+
+#ifdef TRE_USE_SYSTEM_REGEX_H
+/* Avoid some redefinition warnings from including tre.h. */
+#ifdef tre_regexec
+#undef tre_regexec
+/* No need for the *n* fn, it isn't in the system abi. */
+#endif
+#endif
+#define tre_regexec tre_regwexec
+#define tre_regnexec tre_regwnexec
+#ifdef TRE_USE_SYSTEM_REGEX_H
+/* Avoid some redefinition warnings from including tre.h. */
+#ifdef tre_regcomp
+#undef tre_regcomp
+#endif
+/* No need for the *n* fn, it isn't in the system abi. */
+#endif
+#define tre_regcomp tre_regwcomp
+#define tre_regncomp tre_regwncomp
+
+/* Iterate tre_mbrtowc over the multi-byte sequence STR of length LEN.
+
+   BUF, when non-NULL, receives the converted wide characters.  OFF, when
+   non-NULL, is filled so that OFF[0] is 0, OFF[b] is the number of wide
+   characters that end exactly at byte offset b, and every other entry up
+   to OFF[LEN] is -1.
+
+   Returns the number of wide characters produced, or the negative value
+   returned by tre_mbrtowc on a conversion failure.  Terminates the
+   process if LEN does not fit in the MAXSTRSIZE-sized scratch arrays.  */
+
+static int
+mbntowc (wchar_t *buf, const char *str, size_t len, int *off)
+{
+  int n, wlen;
+#ifdef HAVE_MBSTATE_T
+  mbstate_t cst;
+  memset(&cst, 0, sizeof(cst));
+#endif
+
+  if (len >= MAXSTRSIZE)
+    {
+      fprintf(stderr, "Increase MAXSTRSIZE to %ld or more and recompile!\n",
+              (long)len + 1);
+      exit(EXIT_FAILURE);
+    }
+
+  if (off)
+    {
+      /* Filling with 0xff bytes makes every int in the range equal -1. */
+      memset(off + 1, -1, len * sizeof(int));
+      *off = 0;
+    }
+
+  wlen = 0;
+  while (len > 0)
+    {
+      /* NOTE(review): `cst' only exists under HAVE_MBSTATE_T; presumably
+         tre_mbrtowc is a macro that drops the argument otherwise. */
+      n = tre_mbrtowc(buf ? buf++ : NULL, str, len, &cst);
+      if (n < 0)
+        return n;
+      /* A zero result is counted as one consumed byte. */
+      if (n == 0)
+        n = 1;
+      str += n;
+      len -= n;
+      wlen += 1;
+      if (off)
+        *(off += n) = wlen;
+    }
+
+  return(wlen);
+}
+
+#else /* !WRETEST */
+#define CHAR_T char
+#define L(x) (x)
+#endif /* !WRETEST */
+
+/* Stream that receives progress marks and error reports. */
+static FILE *outf = NULL;
+
+/* State shared between test_comp() and the test_exec()/test_nexec() calls
+   that follow it. */
+static int valid_reobj = 0;
+static regex_t reobj;
+static regmatch_t pmatch_global[32];
+static const CHAR_T *regex_pattern;
+static int cflags_global;
+static int use_regnexec = 0;
+static int use_regncomp = 0;
+static int avoid_eflags = 0;
+
+/* Counters for executed tests and detected failures. */
+static int comp_tests = 0;
+static int exec_tests = 0;
+static int comp_errors = 0;
+static int exec_errors = 0;
+
+#ifndef REG_OK
+#define REG_OK 0
+#endif /* REG_OK */
+
+/* Terminates the variadic list of expected submatch offsets. */
+#define END -2
+
+/* Write the progress character C to outf, breaking the line after every
+   79 characters, and flush. */
+static void
+test_status(char c)
+{
+  static int k = 0;
+  fprintf(outf, "%c", c);
+  if (++k % 79 == 0)
+    fprintf(outf, "\n");
+  fflush(outf);
+}
+
+
+/* Run the compiled regex `reobj' against the LEN characters at DATA.
+
+   The subject is copied into a freshly allocated buffer of exactly the
+   size the chosen entry point needs: LEN characters for the counted
+   tre_regnexec(), LEN + 1 (NUL-terminated) for tre_regexec().  For a zero
+   length counted match a NULL pointer is passed, since it should not be
+   dereferenced at all.
+
+   Returns the matcher's result, or REG_ESPACE if the copy could not be
+   allocated. */
+static int
+wrap_regexec(const CHAR_T *data, size_t len,
+             size_t pmatch_len, regmatch_t *pmatch, int eflags)
+{
+  CHAR_T *buf = NULL;
+  int result;
+  /* The counted call is only usable when it is compiled in.  Deciding on
+     use_regnexec alone made the fallback below write buf[len] through a
+     NULL pointer (len == 0) or one element past the allocation. */
+#ifdef HAVE_REGNEXEC
+  const int counted = use_regnexec;
+#else /* !HAVE_REGNEXEC */
+  const int counted = 0;
+#endif /* !HAVE_REGNEXEC */
+
+  if (!(len == 0 && counted))
+    {
+      /* Copy the data to a separate buffer to make a better test for
+         tre_regexec() and tre_regnexec(). */
+      buf = xmalloc((len + !counted) * sizeof(CHAR_T));
+      if (!buf)
+        return REG_ESPACE;
+      memcpy(buf, data, len * sizeof(CHAR_T));
+      test_status('#');
+    }
+
+#ifdef HAVE_REGNEXEC
+  if (counted)
+    {
+      /* buf is NULL here exactly when len == 0. */
+      result = tre_regnexec(&reobj, buf, len, pmatch_len, pmatch, eflags);
+    }
+  else
+#endif /* HAVE_REGNEXEC */
+    {
+      buf[len] = L('\0');
+      result = tre_regexec(&reobj, buf, pmatch_len, pmatch, eflags);
+    }
+
+  xfree(buf);
+  return result;
+}
+
+/* Compile DATA into PREG with CFLAGS.  Uses the counted tre_regncomp()
+   when it is available and `use_regncomp' is set, tre_regcomp() otherwise.
+   Without HAVE_REGNCOMP the pattern is also echoed to stderr. */
+static int
+wrap_regcomp(regex_t *preg, const CHAR_T *data, size_t len, int cflags)
+{
+#ifdef HAVE_REGNCOMP
+  if (use_regncomp)
+    return tre_regncomp(preg, data, len, cflags);
+  else
+    return tre_regcomp(preg, data, cflags);
+#else /* !HAVE_REGNCOMP */
+  /* CHAR_T is wchar_t under WRETEST, which needs %ls rather than %s. */
+#ifdef WRETEST
+  fprintf(stderr, "%ls\n", data);
+#else /* !WRETEST */
+  fprintf(stderr, "%s\n", data);
+#endif /* !WRETEST */
+  return tre_regcomp(preg, data, cflags);
+#endif /* !HAVE_REGNCOMP */
+}
+
+/* Run wrap_regexec() once, or, under MALLOC_DEBUGGING, repeatedly with
+   xmalloc_configure(0), (1), ... until the result is something other than
+   REG_ESPACE (presumably each step lets one more allocation succeed --
+   confirm against xmalloc.h). */
+static int
+execute(const CHAR_T *data, int len, size_t pmatch_len, regmatch_t *pmatch,
+        int eflags)
+{
+#ifdef MALLOC_DEBUGGING
+  int i = 0;
+  int ret;
+
+  while (1)
+    {
+      xmalloc_configure(i);
+      /* NOTE(review): this bumps comp_tests although it is an execution
+         retry; looks like a copy/paste from test_comp() -- confirm. */
+      comp_tests++;
+      ret = wrap_regexec(data, len, pmatch_len, pmatch, eflags);
+      if (ret != REG_ESPACE)
+        {
+          break;
+        }
+#ifdef REGEX_DEBUG
+      xmalloc_dump_leaks();
+#endif /* REGEX_DEBUG */
+      i++;
+    }
+  return ret;
+#else /* !MALLOC_DEBUGGING */
+  return wrap_regexec(data, len, pmatch_len, pmatch, eflags);
+#endif /* !MALLOC_DEBUGGING */
+}
+
+/* Compare the outcome of one match against the expectations in AP.
+
+   AP yields the expected return code first.  If the match succeeded it
+   then yields expected (rm_so, rm_eo) pairs, one per submatch, ended by
+   END or by PMATCH_LEN pairs having been read.  Under WRETEST the expected
+   byte offsets are translated to wide character offsets through `woffs'.
+   Entries of PMATCH after the listed ones must be (-1, -1).
+
+   Reports every mismatch on outf; returns 1 on failure and 0 otherwise. */
+static int
+check(va_list ap, int ret, const CHAR_T *str,
+      size_t pmatch_len, regmatch_t *pmatch, int eflags)
+{
+  int fail = 0;
+
+  if (ret != va_arg(ap, int))
+    {
+#ifndef WRETEST
+      fprintf(outf, "Exec error, regex: \"%s\", cflags %d, "
+              "string: \"%s\", eflags %d\n", regex_pattern, cflags_global,
+              str, eflags);
+#else /* WRETEST */
+      fprintf(outf, "Exec error, regex: \"%ls\", cflags %d, "
+              "string: \"%ls\", eflags %d\n", regex_pattern, cflags_global,
+              str, eflags);
+#endif /* WRETEST */
+      fprintf(outf, " got %smatch (tre_regexec returned %d)\n", ret ? "no " : "", ret);
+      return 1;
+    }
+
+  if (ret == 0)
+    {
+      unsigned int i;
+
+      for (i = 0; i < pmatch_len; i++)
+        {
+          int rm_so, rm_eo;
+          rm_so = va_arg(ap, int);
+          if (rm_so == END)
+            break;
+          rm_eo = va_arg(ap, int);
+#ifdef WRETEST
+          if (rm_so >= 0)
+            {
+              int n = rm_so;
+
+              /* n tracks the byte offset being translated so the message
+                 can name the one that had no wide character boundary. */
+              if ((rm_so = woffs[rm_so]) < 0 ||
+                  (n = rm_eo, rm_eo = woffs[rm_eo]) < 0)
+                {
+                  fprintf(outf, "Invalid or incomplete multi-byte sequence "
+                          "in string %ls before byte offset %d\n", str, n);
+                  return 1;
+                }
+            }
+#endif /* WRETEST */
+          if (pmatch[i].rm_so != rm_so
+              || pmatch[i].rm_eo != rm_eo)
+            {
+#ifndef WRETEST
+              fprintf(outf, "Exec error, regex: \"%s\", string: \"%s\"\n",
+                      regex_pattern, str);
+              fprintf(outf, " group %d: expected (%d, %d) \"%.*s\", "
+                      "got (%d, %d) \"%.*s\"\n",
+#else /* WRETEST */
+              fprintf(outf, "Exec error, regex: \"%ls\", string: \"%ls\"\n",
+                      regex_pattern, str);
+              fprintf(outf, " group %d: expected (%d, %d) \"%.*ls\", "
+                      "got (%d, %d) \"%.*ls\"\n",
+#endif /* WRETEST */
+                      (int)i, rm_so, rm_eo, rm_eo - rm_so, str + rm_so,
+                      (int)pmatch[i].rm_so, (int)pmatch[i].rm_eo,
+                      (int)(pmatch[i].rm_eo - pmatch[i].rm_so),
+                      str + pmatch[i].rm_so);
+              fail = 1;
+            }
+        }
+
+      /* i is now the number of listed pairs (whole match included), so
+         the pattern is expected to have i - 1 subexpressions. */
+      if (!(cflags_global & REG_NOSUB) && reobj.re_nsub != i - 1
+          && reobj.re_nsub <= pmatch_len && pmatch)
+        {
+#ifndef WRETEST
+          fprintf(outf, "Comp error, regex: \"%s\"\n", regex_pattern);
+#else /* WRETEST */
+          fprintf(outf, "Comp error, regex: \"%ls\"\n", regex_pattern);
+#endif /* WRETEST */
+          fprintf(outf, " re_nsub is %d, should be %d\n", (int)reobj.re_nsub,
+                  (int)i - 1);
+          fail = 1;
+        }
+
+
+      for (; i < pmatch_len; i++)
+        if (pmatch[i].rm_so != -1 || pmatch[i].rm_eo != -1)
+          {
+            if (!fail)
+#ifndef WRETEST
+              fprintf(outf, "Exec error, regex: \"%s\", string: \"%s\"\n",
+                      regex_pattern, str);
+#else /* WRETEST */
+              fprintf(outf, "Exec error, regex: \"%ls\", string: \"%ls\"\n",
+                      regex_pattern, str);
+#endif /* WRETEST */
+            fprintf(outf, " group %d: expected (-1, -1), got (%d, %d)\n",
+                    (int)i, (int)pmatch[i].rm_so, (int)pmatch[i].rm_eo);
+            fail = 1;
+          }
+    }
+
+  return fail;
+}
+
+
+/* Match the LEN bytes at DATA against the most recently compiled regex
+   using the counted entry point only (use_regnexec is forced to 1).
+
+   The variadic arguments are the expectations consumed by check().  The
+   match is run once per entry of extra_flags, each time both with the
+   global pmatch array and with a NULL pmatch.  Combinations the regex
+   cannot be run with (or that `avoid_eflags' excludes) are skipped.
+   Counts at most one error per call. */
+static void
+test_nexec(const char *data, size_t len, int eflags, ...)
+{
+  int m;
+  int fail = 0;
+  int extra_flags[] = {0, REG_BACKTRACKING_MATCHER, REG_APPROX_MATCHER};
+  size_t i;
+  va_list ap;
+
+  if (!valid_reobj)
+    {
+      exec_errors++;
+      return;
+    }
+
+#ifdef WRETEST
+  {
+    int wlen = mbntowc(wstr, data, len, woffs);
+    if (wlen < 0)
+      {
+        exec_errors++;
+        fprintf(outf, "Invalid or incomplete multi-byte sequence in %s\n", data);
+        return;
+      }
+    wstr[wlen] = L'\0';
+    len = wlen;
+  }
+  /* From here on the subject is the converted wide string. */
+#define data wstr
+#endif /* WRETEST */
+
+  use_regnexec = 1;
+
+  for (i = 0; i < elementsof(extra_flags); i++)
+    {
+      int final_flags = eflags | extra_flags[i];
+
+      if ((final_flags & REG_BACKTRACKING_MATCHER
+           && tre_have_approx(&reobj))
+          || (final_flags & REG_APPROX_MATCHER
+              && tre_have_backrefs(&reobj))
+          || (final_flags & avoid_eflags))
+        continue;
+
+      /* Test with a pmatch array. */
+      exec_tests++;
+      m = execute(data, len, elementsof(pmatch_global), pmatch_global,
+                  final_flags);
+      va_start(ap, eflags);
+      fail |= check(ap, m, data, elementsof(pmatch_global), pmatch_global,
+                    final_flags);
+      va_end(ap);
+
+      /* Same test with a NULL pmatch. */
+      exec_tests++;
+      m = execute(data, len, 0, NULL, final_flags);
+      va_start(ap, eflags);
+      fail |= check(ap, m, data, 0, NULL, final_flags);
+      va_end(ap);
+    }
+
+#ifdef WRETEST
+#undef data
+#endif /* WRETEST */
+
+  if (fail)
+    exec_errors++;
+}
+
+
+
+/* Match the NUL-terminated STR against the most recently compiled regex.
+
+   Works like test_nexec(), but takes the length from strlen(STR), runs
+   every combination with use_regnexec set to 0 and then to 1, and also
+   tries both matcher flags at once.  The variadic arguments are the
+   expectations consumed by check().  Counts at most one error per call. */
+static void
+test_exec(const char *str, int eflags, ...)
+{
+  int m;
+  int fail = 0;
+  size_t len = strlen(str);
+  int extra_flags[] = {0,
+                       REG_BACKTRACKING_MATCHER,
+                       REG_APPROX_MATCHER,
+                       REG_BACKTRACKING_MATCHER | REG_APPROX_MATCHER};
+  size_t i;
+  va_list ap;
+
+  if (!valid_reobj)
+    {
+      exec_errors++;
+      return;
+    }
+
+#ifdef WRETEST
+  {
+    int wlen = mbntowc(wstr, str, len, woffs);
+    if (wlen < 0)
+      {
+        exec_errors++;
+        fprintf(outf, "Invalid or incomplete multi-byte sequence in %s\n", str);
+        return;
+      }
+    wstr[wlen] = L'\0';
+    len = wlen;
+  }
+  /* From here on the subject is the converted wide string. */
+#define str wstr
+#endif /* WRETEST */
+
+  for (use_regnexec = 0; use_regnexec < 2; use_regnexec++)
+    {
+      for (i = 0; i < elementsof(extra_flags); i++)
+        {
+          int final_flags = eflags | extra_flags[i];
+
+          if ((final_flags & REG_BACKTRACKING_MATCHER
+               && tre_have_approx(&reobj))
+              || (final_flags & REG_APPROX_MATCHER
+                  && tre_have_backrefs(&reobj))
+              || (final_flags & avoid_eflags))
+            continue;
+
+          /* Test with a pmatch array. */
+          exec_tests++;
+          m = execute(str, len, elementsof(pmatch_global), pmatch_global,
+                      final_flags);
+          va_start(ap, eflags);
+          fail |= check(ap, m, str, elementsof(pmatch_global), pmatch_global,
+                        final_flags);
+          va_end(ap);
+
+          /* Same test with a NULL pmatch. */
+          exec_tests++;
+          m = execute(str, len, 0, NULL, final_flags);
+          va_start(ap, eflags);
+          fail |= check(ap, m, str, 0, NULL, final_flags);
+          va_end(ap);
+        }
+    }
+
+#ifdef WRETEST
+#undef str
+#endif /* WRETEST */
+
+  if (fail)
+    exec_errors++;
+}
+
+
+/* Compile RE with FLAGS into the global `reobj' and verify that the
+   return code equals RET.
+
+   Any previously compiled regex is freed first.  The pattern and flags
+   are remembered for later error messages.  `valid_reobj' is set only
+   when compilation returned 0, which is what allows the following
+   test_exec()/test_nexec() calls to run.  Under MALLOC_DEBUGGING, and
+   unless REG_ESPACE is the expected result, compilation is retried with
+   xmalloc_configure(0), (1), ... until it stops returning REG_ESPACE. */
+static void
+test_comp(const char *re, int flags, int ret)
+{
+  int errcode = 0;
+  int len = re ? strlen(re) : 0;
+
+  if (valid_reobj)
+    {
+      tre_regfree(&reobj);
+      valid_reobj = 0;
+    }
+
+  comp_tests++;
+
+#ifdef WRETEST
+  {
+    int wlen = mbntowc(wregex, re, len, NULL);
+
+    if (wlen < 0)
+      {
+        comp_errors++;
+        fprintf(outf, "Invalid or incomplete multi-byte sequence in %s\n", re);
+        return;
+      }
+    wregex[wlen] = L'\0';
+    len = wlen;
+  }
+  /* From here on the pattern is the converted wide string. */
+#define re wregex
+#endif /* WRETEST */
+  regex_pattern = re;
+  cflags_global = flags;
+
+#ifdef MALLOC_DEBUGGING
+  xmalloc_configure(-1);
+  if (ret != REG_ESPACE) {
+    static int j = 0;
+    int i = 0;
+    while (1)
+      {
+        xmalloc_configure(i);
+        comp_tests++;
+        /* Print a progress dot for every 20th attempt only. */
+        if (j++ % 20 == 0)
+          test_status('.');
+        errcode = wrap_regcomp(&reobj, re, len, flags);
+        if (errcode != REG_ESPACE)
+          {
+            test_status('*');
+            break;
+          }
+#ifdef REGEX_DEBUG
+        xmalloc_dump_leaks();
+#endif /* REGEX_DEBUG */
+        i++;
+      }
+  } else
+#endif /* !MALLOC_DEBUGGING */
+  errcode = wrap_regcomp(&reobj, re, len, flags);
+
+#ifdef WRETEST
+#undef re
+#endif /* WRETEST */
+
+  if (errcode != ret)
+    {
+#ifndef WRETEST
+      fprintf(outf, "Comp error, regex: \"%s\"\n", regex_pattern);
+#else /* WRETEST */
+      fprintf(outf, "Comp error, regex: \"%ls\"\n", regex_pattern);
+#endif /* WRETEST */
+      fprintf(outf, " expected return code %d, got %d.\n",
+              ret, errcode);
+      comp_errors++;
+    }
+
+  if (errcode == 0)
+    valid_reobj = 1;
+}
+
+
+
+/* To enable tests for known bugs, set this to 1. */
+#define KNOWN_BUG 0
+
+int
+main(int argc, char **argv)
+{
+  outf = stdout;
+#if defined(HAVE_UNISTD_H) || defined(HAVE_GETOPT_H)
+  int opt;
+  while ((opt = getopt(argc, argv, "o:")) != EOF)
+    {
+      switch (opt)
+        {
+        case 'o':
+          if ((outf = fopen(optarg, "w")) == NULL)
+            {
+              perror(optarg);
+              exit(1);
+            }
+          break;
+        default:
+          /* getopt() will have printed an error message already */
+          exit(1);
+        }
+    }
+#endif /* HAVE_UNISTD_H */
+
+#ifdef WRETEST
+  /* Need an 8-bit locale. Or move the two tests with non-ascii
+     characters to the localized internationalization tests.
*/ + if (setlocale(LC_CTYPE, "en_US.ISO-8859-1") == NULL && + setlocale(LC_CTYPE, "en_US.ISO8859-1") == NULL) + fprintf(stderr, "Could not set locale en_US.ISO-8859-1. Expect some\n" + "`Invalid or incomplete multi-byte sequence' errors.\n"); +#endif /* WRETEST */ + /* Large number of macros in one regexp. */ + test_comp("[A-Z]\\d\\s?\\d[A-Z]{2}|[A-Z]\\d{2}\\s?\\d[A-Z]{2}|[A-Z]{2}\\d" + "\\s?\\d[A-Z]{2}|[A-Z]{2}\\d{2}\\s?\\d[A-Z]{2}|[A-Z]\\d[A-Z]\\s?" + "\\d[A-Z]{2}|[A-Z]{2}\\d[A-Z]\\s?\\d[A-Z]{2}|[A-Z]{3}\\s?\\d[A-Z]" + "{2}", REG_EXTENDED, 0); + + test_comp("a{11}(b{2}c){2}", REG_EXTENDED, 0); + test_comp("a{2}{2}xb+xc*xd?x", REG_EXTENDED, 0); + test_comp("^!packet [0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3} [0-9]+", + REG_EXTENDED, 0); + test_comp("^!pfast [0-9]{1,15} ([0-9]{1,3}\\.){3}[0-9]{1,3}[0-9]{1,5}$", + REG_EXTENDED, 0); + +#if KNOWN_BUG + /* Should these match or not? */ + test_comp("(a)*-\\1b", REG_EXTENDED, 0); + test_exec("aaa-b", 0, REG_NOMATCH); + test_comp("((.*)\\1)+", REG_EXTENDED, 0); + test_exec("xxxxxx", 0, REG_NOMATCH); +#endif + +#ifdef TRE_APPROX + /* + * Approximate matching tests. + * + * The approximate matcher always searches for the best match, and returns + * the leftmost and longest one if there are several best matches. 
+ */ + + test_comp("(fou){# ~1}", REG_EXTENDED, 0); + test_comp("(fuu){#}", REG_EXTENDED, 0); + test_comp("(fuu){# ~}", REG_EXTENDED, 0); + test_comp("(anaconda){ 1i + 1d < 1, #1}", REG_EXTENDED, 0); + test_comp("(anaconda){ 1i + 1d < 1 #1 ~10 }", REG_EXTENDED, 0); + test_comp("(anaconda){ #1, ~1, 1i + 1d < 1 }", REG_EXTENDED, 0); + + test_comp("(znacnda){ #1 ~3 1i + 1d < 1 }", REG_EXTENDED, 0); + test_exec("molasses anaconda foo bar baz smith anderson ", + 0, REG_NOMATCH); + test_comp("(znacnda){ #1 ~3 1i + 1d < 2 }", REG_EXTENDED, 0); + test_exec("molasses anaconda foo bar baz smith anderson ", + 0, REG_OK, 9, 17, 9, 17, END); + test_comp("(ananda){ 1i + 1d < 2 }", REG_EXTENDED, 0); + test_exec("molasses anaconda foo bar baz smith anderson ", + 0, REG_NOMATCH); + + test_comp("(fuu){ +3 -3 ~5}", REG_EXTENDED, 0); + test_exec("anaconda foo bar baz smith anderson", + 0, REG_OK, 9, 10, 9, 10, END); + test_comp("(fuu){ +2 -2 ~5}", REG_EXTENDED, 0); + test_exec("anaconda foo bar baz smith anderson", + 0, REG_OK, 9, 10, 9, 10, END); + test_comp("(fuu){ +3 -3 ~}", REG_EXTENDED, 0); + test_exec("anaconda foo bar baz smith anderson", + 0, REG_OK, 9, 10, 9, 10, END); + + test_comp("(laurikari){ #3, 1i + 1d < 3 }", REG_EXTENDED, 0); + + /* No cost limit. */ + test_comp("(foobar){~}", REG_EXTENDED, 0); + test_exec("xirefoabralfobarxie", 0, REG_OK, 11, 16, 11, 16, END); + + /* At most two errors. */ + test_comp("(foobar){~2}", REG_EXTENDED, 0); + test_exec("xirefoabrzlfd", 0, REG_OK, 4, 9, 4, 9, END); + test_exec("xirefoabzlfd", 0, REG_NOMATCH); + + /* At most two inserts or substitutions and max two errors total. */ + test_comp("(foobar){+2#2~2}", REG_EXTENDED, 0); + test_exec("oobargoobaploowap", 0, REG_OK, 5, 11, 5, 11, END); + + /* Find best whole word match for "foobar". 
*/ + test_comp("\\<(foobar){~}\\>", REG_EXTENDED, 0); + test_exec("zfoobarz", 0, REG_OK, 0, 8, 0, 8, END); + test_exec("boing zfoobarz goobar woop", 0, REG_OK, 15, 21, 15, 21, END); + + /* Match whole string, allow only 1 error. */ + test_comp("^(foobar){~1}$", REG_EXTENDED, 0); + test_exec("foobar", 0, REG_OK, 0, 6, 0, 6, END); + test_exec("xfoobar", 0, REG_OK, 0, 7, 0, 7, END); + /* + This currently fails. + test_exec("foobarx", 0, REG_OK, 0, 7, 0, 7, END); + */ + test_exec("fooxbar", 0, REG_OK, 0, 7, 0, 7, END); + test_exec("foxbar", 0, REG_OK, 0, 6, 0, 6, END); + test_exec("xoobar", 0, REG_OK, 0, 6, 0, 6, END); + test_exec("foobax", 0, REG_OK, 0, 6, 0, 6, END); + test_exec("oobar", 0, REG_OK, 0, 5, 0, 5, END); + test_exec("fobar", 0, REG_OK, 0, 5, 0, 5, END); + test_exec("fooba", 0, REG_OK, 0, 5, 0, 5, END); + test_exec("xfoobarx", 0, REG_NOMATCH); + test_exec("foobarxx", 0, REG_NOMATCH); + test_exec("xxfoobar", 0, REG_NOMATCH); + test_exec("xfoxbar", 0, REG_NOMATCH); + test_exec("foxbarx", 0, REG_NOMATCH); + + /* At most one insert, two deletes, and three substitutions. + Additionally, deletes cost two and substitutes one, and total + cost must be less than 4. */ + test_comp("(foobar){+1 -2 #3, 2d + 1s < 4}", REG_EXTENDED, 0); + test_exec("3oifaowefbaoraofuiebofasebfaobfaorfeoaro", + 0, REG_OK, 26, 33, 26, 33, END); + + /* Partially approximate matches. */ + test_comp("foo(bar){~1}zap", REG_EXTENDED, 0); + test_exec("foobarzap", 0, REG_OK, 0, 9, 3, 6, END); + test_exec("fobarzap", 0, REG_NOMATCH); + test_exec("foobrzap", 0, REG_OK, 0, 8, 3, 5, END); + test_comp("^.*(dot.org){~}.*$", REG_EXTENDED, 0); + test_exec("www.cnn.com 64.236.16.20\n" + "www.slashdot.org 66.35.250.150\n" + "For useful information, use www.slashdot.org\n" + "this is demo data!\n", + 0, REG_OK, 0, 120, 93, 100, END); + + /* Approximate matching and back referencing cannot be used together. 
*/ + test_comp("(foo{~})\\1", REG_EXTENDED, REG_BADPAT); + +#endif /* TRE_APPROX */ + + /* + * Basic tests with pure regular expressions + */ + + /* Basic string matching. */ + test_comp("foobar", REG_EXTENDED, 0); + test_exec("foobar", 0, REG_OK, 0, 6, END); + test_exec("xxxfoobarzapzot", 0, REG_OK, 3, 9, END); + test_comp("foobar", REG_EXTENDED | REG_NOSUB, 0); + test_exec("foobar", 0, REG_OK, END); + test_comp("aaaa", REG_EXTENDED, 0); + test_exec("xxaaaaaaaaaaaaaaaaa", 0, REG_OK, 2, 6, END); + + /* Test zero length matches. */ + test_comp("(a*)", REG_EXTENDED, 0); + test_exec("", 0, REG_OK, 0, 0, 0, 0, END); + + test_comp("(a*)*", REG_EXTENDED, 0); + test_exec("", 0, REG_OK, 0, 0, 0, 0, END); + + test_comp("((a*)*)*", REG_EXTENDED, 0); + test_exec("", 0, REG_OK, 0, 0, 0, 0, 0, 0, END); + test_comp("(a*bcd)*", REG_EXTENDED, 0); + test_exec("aaaaaaaaaaaabcxbcxbcxaabcxaabcx", 0, REG_OK, 0, 0, -1, -1, END); + test_exec("aaaaaaaaaaaabcxbcxbcxaabcxaabc", 0, REG_OK, 0, 0, -1, -1, END); + test_exec("aaaaaaaaaaaabcxbcdbcxaabcxaabc", 0, REG_OK, 0, 0, -1, -1, END); + test_exec("aaaaaaaaaaaabcdbcdbcxaabcxaabc", 0, REG_OK, 0, 18, 15, 18, END); + + test_comp("(a*)+", REG_EXTENDED, 0); + test_exec("-", 0, REG_OK, 0, 0, 0, 0, END); + + /* This test blows up the backtracking matcher. */ + avoid_eflags = REG_BACKTRACKING_MATCHER; + test_comp("((a*)*b)*b", REG_EXTENDED, 0); + test_exec("aaaaaaaaaaaaaaaaaaaaaaaaab", 0, REG_OK, + 25, 26, -1, -1, -1, -1, END); + avoid_eflags = 0; + + test_comp("", 0, 0); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("foo", 0, REG_OK, 0, 0, END); + + /* Test for submatch addressing which requires arbitrary lookahead. */ + test_comp("(a*)aaaaaa", REG_EXTENDED, 0); + test_exec("aaaaaaaaaaaaaaax", 0, REG_OK, 0, 15, 0, 9, END); + + /* Test leftmost and longest matching and some tricky submatches. 
*/ + test_comp("(a*)(a*)", REG_EXTENDED, 0); + test_exec("aaaa", 0, REG_OK, 0, 4, 0, 4, 4, 4, END); + test_comp("(abcd|abc)(d?)", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 4, 0, 4, 4, 4, END); + test_comp("(abc|abcd)(d?)", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 4, 0, 4, 4, 4, END); + test_comp("(abc|abcd)(d?)e", REG_EXTENDED, 0); + test_exec("abcde", 0, REG_OK, 0, 5, 0, 4, 4, 4, END); + test_comp("(abcd|abc)(d?)e", REG_EXTENDED, 0); + test_exec("abcde", 0, REG_OK, 0, 5, 0, 4, 4, 4, END); + test_comp("a(bc|bcd)(d?)", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 4, 1, 4, 4, 4, END); + test_comp("a(bcd|bc)(d?)", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 4, 1, 4, 4, 4, END); + test_comp("a*(a?bc|bcd)(d?)", REG_EXTENDED, 0); + test_exec("aaabcd", 0, REG_OK, 0, 6, 3, 6, 6, 6, END); + test_comp("a*(bcd|a?bc)(d?)", REG_EXTENDED, 0); + test_exec("aaabcd", 0, REG_OK, 0, 6, 3, 6, 6, 6, END); + test_comp("(a|(a*b*))*", REG_EXTENDED, 0); + test_exec("", 0, REG_OK, 0, 0, 0, 0, 0, 0, END); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, -1, -1, END); + test_exec("aa", 0, REG_OK, 0, 2, 0, 2, 0, 2, END); + test_exec("aaa", 0, REG_OK, 0, 3, 0, 3, 0, 3, END); + test_exec("bbb", 0, REG_OK, 0, 3, 0, 3, 0, 3, END); + test_exec("aaabbb", 0, REG_OK, 0, 6, 0, 6, 0, 6, END); + test_exec("bbbaaa", 0, REG_OK, 0, 6, 3, 6, 3, 6, END); + test_comp("((a*b*)|a)*", REG_EXTENDED, 0); + test_exec("", 0, REG_OK, 0, 0, 0, 0, 0, 0, END); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, 0, 1, END); + test_exec("aa", 0, REG_OK, 0, 2, 0, 2, 0, 2, END); + test_exec("aaa", 0, REG_OK, 0, 3, 0, 3, 0, 3, END); + test_exec("bbb", 0, REG_OK, 0, 3, 0, 3, 0, 3, END); + test_exec("aaabbb", 0, REG_OK, 0, 6, 0, 6, 0, 6, END); + test_exec("bbbaaa", 0, REG_OK, 0, 6, 3, 6, 3, 6, END); + test_comp("a.*(.*b.*(.*c.*).*d.*).*e.*(.*f.*).*g", REG_EXTENDED, 0); + test_exec("aabbccddeeffgg", 0, REG_OK, 0, 14, 3, 9, 5, 7, 11, 13, END); + test_comp("(wee|week)(night|knights)s*", REG_EXTENDED, 0); + 
test_exec("weeknights", 0, REG_OK, 0, 10, 0, 3, 3, 10, END); + test_exec("weeknightss", 0, REG_OK, 0, 11, 0, 3, 3, 10, END); + test_comp("a*", REG_EXTENDED, 0); + test_exec("aaaaaaaaaa", 0, REG_OK, 0, 10, END); + test_comp("aa*", REG_EXTENDED, 0); + test_exec("aaaaaaaaaa", 0, REG_OK, 0, 10, END); + test_comp("aaa*", REG_EXTENDED, 0); + test_exec("aaaaaaaaaa", 0, REG_OK, 0, 10, END); + test_comp("aaaa*", REG_EXTENDED, 0); + test_exec("aaaaaaaaaa", 0, REG_OK, 0, 10, END); + + /* Test clearing old submatch data with nesting parentheses + and iteration. */ + test_comp("((a)|(b))*c", REG_EXTENDED, 0); + test_exec("aaabc", 0, REG_OK, 0, 5, 3, 4, -1, -1, 3, 4, END); + test_exec("aaaac", 0, REG_OK, 0, 5, 3, 4, 3, 4, -1, -1, END); + test_comp("foo((bar)*)*zot", REG_EXTENDED, 0); + test_exec("foozot", 0, REG_OK, 0, 6, 3, 3, -1, -1, END); + test_exec("foobarzot", 0, REG_OK, 0, 9, 3, 6, 3, 6, END); + test_exec("foobarbarzot", 0, REG_OK, 0, 12, 3, 9, 6, 9, END); + + test_comp("foo((zup)*|(bar)*|(zap)*)*zot", REG_EXTENDED, 0); + test_exec("foobarzapzot", 0, REG_OK, + 0, 12, 6, 9, -1, -1, -1, -1, 6, 9, END); + test_exec("foobarbarzapzot", 0, REG_OK, + 0, 15, 9, 12, -1, -1, -1, -1, 9, 12, END); + test_exec("foozupzot", 0, REG_OK, + 0, 9, 3, 6, 3, 6, -1, -1, -1, -1, END); + test_exec("foobarzot", 0, REG_OK, + 0, 9, 3, 6, -1, -1, 3, 6, -1, -1, END); + test_exec("foozapzot", 0, REG_OK, + 0, 9, 3, 6, -1, -1, -1, -1, 3, 6, END); + test_exec("foozot", 0, REG_OK, + 0, 6, 3, 3, -1, -1, -1, -1, -1, -1, END); + + + /* Test case where, e.g., Perl and Python regexp functions, and many + other backtracking matchers, fail to produce the longest match. + It is not exactly a bug since Perl does not claim to find the + longest match, but a confusing feature and, in my opinion, a bad + design choice because the union operator is traditionally defined + to be commutative (with respect to the language denoted by the RE). 
*/ + test_comp("(a|ab)(blip)?", REG_EXTENDED, 0); + test_exec("ablip", 0, REG_OK, 0, 5, 0, 1, 1, 5, END); + test_exec("ab", 0, REG_OK, 0, 2, 0, 2, -1, -1, END); + test_comp("(ab|a)(blip)?", REG_EXTENDED, 0); + test_exec("ablip", 0, REG_OK, 0, 5, 0, 1, 1, 5, END); + test_exec("ab", 0, REG_OK, 0, 2, 0, 2, -1, -1, END); + + /* Test more submatch addressing. */ + test_comp("((a|b)*)a(a|b)*", REG_EXTENDED, 0); + test_exec("aaaaabaaaba", 0, REG_OK, 0, 11, 0, 10, 9, 10, -1, -1, END); + test_exec("aaaaabaaab", 0, REG_OK, 0, 10, 0, 8, 7, 8, 9, 10, END); + test_exec("caa", 0, REG_OK, 1, 3, 1, 2, 1, 2, -1, -1, END); + test_comp("((a|aba)*)(ababbaba)((a|b)*)", REG_EXTENDED, 0); + test_exec("aabaababbabaaababbab", 0, REG_OK, + 0, 20, 0, 4, 1, 4, 4, 12, 12, 20, 19, 20, END); + test_exec("aaaaababbaba", 0, REG_OK, + 0, 12, 0, 4, 3, 4, 4, 12, 12, 12, -1, -1, END); + test_comp("((a|aba|abb|bba|bab)*)(ababbababbabbbabbbbbbabbaba)((a|b)*)", + REG_EXTENDED, 0); + test_exec("aabaabbbbabababaababbababbabbbabbbbbbabbabababbababababbabababa", + 0, REG_OK, 0, 63, 0, 16, 13, 16, 16, 43, 43, 63, 62, 63, END); + + /* Test for empty subexpressions. */ + test_comp("", 0, 0); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("foo", 0, REG_OK, 0, 0, END); + test_comp("(a|)", REG_EXTENDED, 0); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, END); + test_exec("b", 0, REG_OK, 0, 0, 0, 0, END); + test_exec("", 0, REG_OK, 0, 0, 0, 0, END); + test_comp("a|", REG_EXTENDED, 0); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("b", 0, REG_OK, 0, 0, END); + test_exec("", 0, REG_OK, 0, 0, END); + test_comp("|a", REG_EXTENDED, 0); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("b", 0, REG_OK, 0, 0, END); + test_exec("", 0, REG_OK, 0, 0, END); + + /* Miscellaneous tests. 
*/ + test_comp("(a*)b(c*)", REG_EXTENDED, 0); + test_exec("abc", 0, REG_OK, 0, 3, 0, 1, 2, 3, END); + test_exec("***abc***", 0, REG_OK, 3, 6, 3, 4, 5, 6, END); + test_comp("(a)", REG_EXTENDED, 0); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, END); + test_comp("((a))", REG_EXTENDED, 0); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, 0, 1, END); + test_comp("(((a)))", REG_EXTENDED, 0); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, 0, 1, 0, 1, END); + test_comp("((((((((((((((((((((a))))))))))))))))))))", REG_EXTENDED, 0); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, END); + + test_comp("ksntoeaiksntoeaikstneoaiksnteoaiksntoeaiskntoeaiskntoekainstoei" + "askntoeakisntoeksaitnokesantiksoentaikosentaiksoentaiksnoeaiskn" + "teoaksintoekasitnoeksaitkosetniaksoetnaisknoetakistoeksintokesa" + "nitksoentaisknoetaisknoetiaksotneaikstoekasitoeskatioksentaikso" + "enatiksoetnaiksonateiksoteaeskanotisknetaiskntoeasknitoskenatis" + "konetaisknoteai", 0, 0); + + test_comp("((aab)|(aac)|(aa*))c", REG_EXTENDED, 0); + test_exec("aabc", 0, REG_OK, 0, 4, 0, 3, 0, 3, -1, -1, -1, -1, END); + test_exec("aacc", 0, REG_OK, 0, 4, 0, 3, -1, -1, 0, 3, -1, -1, END); + test_exec("aaac", 0, REG_OK, 0, 4, 0, 3, -1, -1, -1, -1, 0, 3, END); + + test_comp("^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + REG_EXTENDED, 0); + test_exec("foo!bar!bas", 0, REG_OK, + 0, 11, 0, 11, -1, -1, -1, -1, 4, 8, 8, 11, END); + test_comp("^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + REG_EXTENDED, 0); + test_exec("foo!bar!bas", 0, REG_OK, + 0, 11, -1, -1, -1, -1, 4, 8, 8, 11, END); + test_comp("^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + REG_EXTENDED, 0); + test_exec("foo!bar!bas", 0, REG_OK, + 0, 11, 0, 11, -1, -1, -1, -1, 4, 8, 8, 11, END); + + test_comp("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + REG_EXTENDED, 0); + test_exec("Muammar Quathafi", 0, REG_OK, 0, 16, -1, -1, 11, 13, END); + + 
test_comp("(Ab|cD)*", REG_EXTENDED | REG_ICASE, 0); + test_exec("aBcD", 0, REG_OK, 0, 4, 2, 4, END); + + test_comp("a**", REG_EXTENDED, REG_BADRPT); + test_comp("a*+", REG_EXTENDED, REG_BADRPT); + test_comp("a+*", REG_EXTENDED, REG_BADRPT); + test_comp("a++", REG_EXTENDED, REG_BADRPT); + test_comp("a?+", REG_EXTENDED, REG_BADRPT); + test_comp("a?*", REG_EXTENDED, REG_BADRPT); + test_comp("a{1,2}*", REG_EXTENDED, REG_BADRPT); + test_comp("a{1,2}+", REG_EXTENDED, REG_BADRPT); + + /* + * Many of the following tests were mostly inspired by (or copied from) the + * libhackerlab posix test suite by Tom Lord. + */ + + test_comp("a", 0, 0); + test_exec("a", 0, REG_OK, 0, 1, END); + test_comp("\\.", 0, 0); + test_exec(".", 0, REG_OK, 0, 1, END); + test_comp("\\[", 0, 0); + test_exec("[", 0, REG_OK, 0, 1, END); + test_comp("\\\\", 0, 0); + test_exec("\\", 0, REG_OK, 0, 1, END); + test_comp("\\*", 0, 0); + test_exec("*", 0, REG_OK, 0, 1, END); + test_comp("\\^", 0, 0); + test_exec("^", 0, REG_OK, 0, 1, END); + test_comp("\\$", 0, 0); + test_exec("$", 0, REG_OK, 0, 1, END); + + test_comp("\\", 0, REG_EESCAPE); + + test_comp("x\\.", 0, 0); + test_exec("x.", 0, REG_OK, 0, 2, END); + test_comp("x\\[", 0, 0); + test_exec("x[", 0, REG_OK, 0, 2, END); + test_comp("x\\\\", 0, 0); + test_exec("x\\", 0, REG_OK, 0, 2, END); + test_comp("x\\*", 0, 0); + test_exec("x*", 0, REG_OK, 0, 2, END); + test_comp("x\\^", 0, 0); + test_exec("x^", 0, REG_OK, 0, 2, END); + test_comp("x\\$", 0, 0); + test_exec("x$", 0, REG_OK, 0, 2, END); + + test_comp("x\\", 0, REG_EESCAPE); + + test_comp(".", 0, 0); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("\n", 0, REG_OK, 0, 1, END); + + test_comp("(+|?)", 0, 0); + test_exec("(+|?)", 0, REG_OK, 0, 5, END); + test_exec("+|?", 0, REG_NOMATCH); + test_exec("(+)", 0, REG_NOMATCH); + test_exec("+", 0, REG_NOMATCH); + + + /* + * Test bracket expressions. 
+ */ + + test_comp("[", 0, REG_EBRACK); + test_comp("[]", 0, REG_EBRACK); + test_comp("[^]", 0, REG_EBRACK); + + test_comp("[]x]", 0, 0); + test_exec("]", 0, REG_OK, 0, 1, END); + test_exec("x", 0, REG_OK, 0, 1, END); + + test_comp("[.]", 0, 0); + test_exec(".", 0, REG_OK, 0, 1, END); + test_exec("a", 0, REG_NOMATCH); + + test_comp("[*]", 0, 0); + test_exec("*", 0, REG_OK, 0, 1, END); + + test_comp("[[]", 0, 0); + test_exec("[", 0, REG_OK, 0, 1, END); + + test_comp("[\\]", 0, 0); + test_exec("\\", 0, REG_OK, 0, 1, END); + + test_comp("[-x]", 0, 0); + test_exec("-", 0, REG_OK, 0, 1, END); + test_exec("x", 0, REG_OK, 0, 1, END); + test_comp("[x-]", 0, 0); + test_exec("-", 0, REG_OK, 0, 1, END); + test_exec("x", 0, REG_OK, 0, 1, END); + test_comp("[-]", 0, 0); + test_exec("-", 0, REG_OK, 0, 1, END); + + test_comp("[abc]", 0, 0); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("b", 0, REG_OK, 0, 1, END); + test_exec("c", 0, REG_OK, 0, 1, END); + test_exec("d", 0, REG_NOMATCH); + test_exec("xa", 0, REG_OK, 1, 2, END); + test_exec("xb", 0, REG_OK, 1, 2, END); + test_exec("xc", 0, REG_OK, 1, 2, END); + test_exec("xd", 0, REG_NOMATCH); + test_comp("x[abc]", 0, 0); + test_exec("xa", 0, REG_OK, 0, 2, END); + test_exec("xb", 0, REG_OK, 0, 2, END); + test_exec("xc", 0, REG_OK, 0, 2, END); + test_exec("xd", 0, REG_NOMATCH); + test_comp("[^abc]", 0, 0); + test_exec("a", 0, REG_NOMATCH); + test_exec("b", 0, REG_NOMATCH); + test_exec("c", 0, REG_NOMATCH); + test_exec("d", 0, REG_OK, 0, 1, END); + test_exec("xa", 0, REG_OK, 0, 1, END); + test_exec("xb", 0, REG_OK, 0, 1, END); + test_exec("xc", 0, REG_OK, 0, 1, END); + test_exec("xd", 0, REG_OK, 0, 1, END); + test_comp("x[^abc]", 0, 0); + test_exec("xa", 0, REG_NOMATCH); + test_exec("xb", 0, REG_NOMATCH); + test_exec("xc", 0, REG_NOMATCH); + test_exec("xd", 0, REG_OK, 0, 2, END); + + test_comp("[()+?*\\]+", REG_EXTENDED, 0); + test_exec("x\\*?+()x", 0, REG_OK, 1, 7, END); + + /* Standard character classes. 
*/ + test_comp("[[:alnum:]]+", REG_EXTENDED, 0); + test_exec("%abc123890XYZ=", 0, REG_OK, 1, 13, END); + test_comp("[[:cntrl:]]+", REG_EXTENDED, 0); + test_exec("%\n\t\015\f ", 0, REG_OK, 1, 5, END); + test_comp("[[:lower:]]+", REG_EXTENDED, 0); + test_exec("AbcdE", 0, REG_OK, 1, 4, END); + test_comp("[[:lower:]]+", REG_EXTENDED | REG_ICASE, 0); + test_exec("AbcdE", 0, REG_OK, 0, 5, END); + test_comp("[[:space:]]+", REG_EXTENDED, 0); + test_exec("x \t\f\nx", 0, REG_OK, 1, 5, END); + test_comp("[[:alpha:]]+", REG_EXTENDED, 0); + test_exec("%abC123890xyz=", 0, REG_OK, 1, 4, END); + test_comp("[[:digit:]]+", REG_EXTENDED, 0); + test_exec("%abC123890xyz=", 0, REG_OK, 4, 10, END); + test_comp("[^[:digit:]]+", REG_EXTENDED, 0); + test_exec("%abC123890xyz=", 0, REG_OK, 0, 4, END); + test_comp("[[:print:]]+", REG_EXTENDED, 0); + test_exec("\n\t %abC12\f", 0, REG_OK, 2, 9, END); + test_comp("[[:upper:]]+", REG_EXTENDED, 0); + test_exec("\n aBCDEFGHIJKLMNOPQRSTUVWXYz", 0, REG_OK, 3, 27, END); + test_comp("[[:upper:]]+", REG_EXTENDED | REG_ICASE, 0); + test_exec("\n aBCDEFGHIJKLMNOPQRSTUVWXYz", 0, REG_OK, 2, 28, END); +#ifdef HAVE_ISWBLANK +#ifdef HAVE_ISBLANK + test_comp("[[:blank:]]+", REG_EXTENDED, 0); + test_exec("\na \t b", 0, REG_OK, 2, 5, END); +#endif /* HAVE_ISBLANK */ +#endif /* HAVE_ISWBLANK */ + test_comp("[[:graph:]]+", REG_EXTENDED, 0); + test_exec("\n %abC12\f", 0, REG_OK, 2, 8, END); + test_comp("[[:punct:]]+", REG_EXTENDED, 0); + test_exec("a~!@#$%^&*()_+=-`[]{};':\"|\\,./?>< ", + 0, REG_OK, 1, 33, END); + test_comp("[[:xdigit:]]+", REG_EXTENDED, 0); + test_exec("-0123456789ABCDEFabcdef", 0, REG_OK, 1, 23, END); + test_comp("[[:bogus-character-class-name:]", REG_EXTENDED, REG_ECTYPE); + test_comp("[[:\xff:", REG_EXTENDED, REG_ECTYPE); + + + /* Range expressions (assuming that the C locale is being used). 
*/ + test_comp("[a-z]+", REG_EXTENDED, 0); + test_exec("ABCabcxyzABC", 0, REG_OK, 3, 9, END); + test_comp("[z-a]+", REG_EXTENDED, REG_ERANGE); + test_comp("[a-b-c]", 0, REG_ERANGE); + test_comp("[a-a]+", REG_EXTENDED, 0); + test_exec("zaaaaab", 0, REG_OK, 1, 6, END); + test_comp("[--Z]+", REG_EXTENDED, 0); + test_exec("!ABC-./XYZ~", 0, REG_OK, 1, 10, END); + test_comp("[*--]", 0, 0); + test_exec("-", 0, REG_OK, 0, 1, END); + test_exec("*", 0, REG_OK, 0, 1, END); + test_comp("[*--Z]+", REG_EXTENDED, 0); + test_exec("!+*,---ABC", 0, REG_OK, 1, 7, END); + test_comp("[a-]+", REG_EXTENDED, 0); + test_exec("xa-a--a-ay", 0, REG_OK, 1, 9, END); + + /* REG_ICASE and character sets. */ + test_comp("[a-c]*", REG_ICASE | REG_EXTENDED, 0); + test_exec("cABbage", 0, REG_OK, 0, 5, END); + test_comp("[^a-c]*", REG_ICASE | REG_EXTENDED, 0); + test_exec("tObAcCo", 0, REG_OK, 0, 2, END); + test_comp("[A-C]*", REG_ICASE | REG_EXTENDED, 0); + test_exec("cABbage", 0, REG_OK, 0, 5, END); + test_comp("[^A-C]*", REG_ICASE | REG_EXTENDED, 0); + test_exec("tObAcCo", 0, REG_OK, 0, 2, END); + + /* Complex character sets. 
*/ + test_comp("[[:digit:]a-z#$%]+", REG_EXTENDED, 0); + test_exec("__abc#lmn012$x%yz789*", 0, REG_OK, 2, 20, END); + test_comp("[[:digit:]a-z#$%]+", REG_ICASE | REG_EXTENDED, 0); + test_exec("__abcLMN012x%#$yz789*", 0, REG_OK, 2, 20, END); + test_comp("[^[:digit:]a-z#$%]+", REG_EXTENDED, 0); + test_exec("abc#lmn012$x%yz789--@*,abc", 0, REG_OK, 18, 23, END); + test_comp("[^[:digit:]a-z#$%]+", REG_ICASE | REG_EXTENDED, 0); + test_exec("abc#lmn012$x%yz789--@*,abc", 0, REG_OK, 18, 23, END); + test_comp("[^[:digit:]#$%[:xdigit:]]+", REG_ICASE | REG_EXTENDED, 0); + test_exec("abc#lmn012$x%yz789--@*,abc", 0, REG_OK, 4, 7, END); + test_comp("[^-]+", REG_EXTENDED, 0); + test_exec("---afd*(&,ml---", 0, REG_OK, 3, 12, END); + test_comp("[^--Z]+", REG_EXTENDED, 0); + test_exec("---AFD*(&,ml---", 0, REG_OK, 6, 12, END); + test_comp("[^--Z]+", REG_ICASE | REG_EXTENDED, 0); + test_exec("---AFD*(&,ml---", 0, REG_OK, 6, 10, END); + + /* Unsupported things (equivalence classes and multicharacter collating + elements) */ + test_comp("[[.foo.]]", 0, REG_ECOLLATE); + test_comp("[[=foo=]]", 0, REG_ECOLLATE); + test_comp("[[..]]", 0, REG_ECOLLATE); + test_comp("[[==]]", 0, REG_ECOLLATE); + test_comp("[[.]]", 0, REG_ECOLLATE); + test_comp("[[=]]", 0, REG_ECOLLATE); + test_comp("[[.]", 0, REG_ECOLLATE); + test_comp("[[=]", 0, REG_ECOLLATE); + test_comp("[[.", 0, REG_ECOLLATE); + test_comp("[[=", 0, REG_ECOLLATE); + + + + /* Miscellaneous tests. 
*/ + test_comp("abc\\(\\(de\\)\\(fg\\)\\)hi", 0, 0); + test_exec("xabcdefghiy", 0, REG_OK, 1, 10, 4, 8, 4, 6, 6, 8, END); + + test_comp("abc*def", 0, 0); + test_exec("xabdefy", 0, REG_OK, 1, 6, END); + test_exec("xabcdefy", 0, REG_OK, 1, 7, END); + test_exec("xabcccccccdefy", 0, REG_OK, 1, 13, END); + + test_comp("abc\\(def\\)*ghi", 0, 0); + test_exec("xabcghiy", 0, REG_OK, 1, 7, -1, -1, END); + test_exec("xabcdefghi", 0, REG_OK, 1, 10, 4, 7, END); + test_exec("xabcdefdefdefghi", 0, REG_OK, 1, 16, 10, 13, END); + + test_comp("a?", REG_EXTENDED, REG_OK); + test_exec("aaaaa", 0, REG_OK, 0, 1, END); + test_exec("xaaaaa", 0, REG_OK, 0, 0, END); + test_comp("a+", REG_EXTENDED, REG_OK); + test_exec("aaaaa", 0, REG_OK, 0, 5, END); + test_exec("xaaaaa", 0, REG_OK, 1, 6, END); + + + /* + * Test anchors and their behaviour with the REG_NEWLINE compilation + * flag and the REG_NOTBOL, REG_NOTEOL execution flags. + */ + + /* Normally, `^' matches the empty string at beginning of input. + If REG_NOTBOL is used, `^' won't match the zero length string. */ + test_comp("^abc", 0, 0); + test_exec("abcdef", 0, REG_OK, 0, 3, END); + test_exec("abcdef", REG_NOTBOL, REG_NOMATCH); + test_exec("xyzabcdef", 0, REG_NOMATCH); + test_exec("xyzabcdef", REG_NOTBOL, REG_NOMATCH); + test_exec("\nabcdef", 0, REG_NOMATCH); + test_exec("\nabcdef", REG_NOTBOL, REG_NOMATCH); + + /* Normally, `$' matches the empty string at end of input. + If REG_NOTEOL is used, `$' won't match the zero length string. 
*/ + test_comp("abc$", 0, 0); + test_exec("defabc", 0, REG_OK, 3, 6, END); + test_exec("defabc", REG_NOTEOL, REG_NOMATCH); + test_exec("defabcxyz", 0, REG_NOMATCH); + test_exec("defabcxyz", REG_NOTEOL, REG_NOMATCH); + test_exec("defabc\n", 0, REG_NOMATCH); + test_exec("defabc\n", REG_NOTEOL, REG_NOMATCH); + + test_comp("^abc$", 0, 0); + test_exec("abc", 0, REG_OK, 0, 3, END); + test_exec("abc", REG_NOTBOL, REG_NOMATCH); + test_exec("abc", REG_NOTEOL, REG_NOMATCH); + test_exec("abc", REG_NOTBOL | REG_NOTEOL, REG_NOMATCH); + test_exec("\nabc\n", 0, REG_NOMATCH); + test_exec("defabc\n", 0, REG_NOMATCH); + test_exec("\nabcdef", 0, REG_NOMATCH); + test_exec("abcdef", 0, REG_NOMATCH); + test_exec("defabc", 0, REG_NOMATCH); + test_exec("abc\ndef", 0, REG_NOMATCH); + test_exec("def\nabc", 0, REG_NOMATCH); + + /* If REG_NEWLINE is used, `^' matches the empty string immediately after + a newline, regardless of whether execution flags contain REG_NOTBOL. + Similarly, if REG_NEWLINE is used, `$' matches the empty string + immediately before a newline, regardless of execution flags. 
*/ + test_comp("^abc", REG_NEWLINE, 0); + test_exec("abcdef", 0, REG_OK, 0, 3, END); + test_exec("abcdef", REG_NOTBOL, REG_NOMATCH); + test_exec("xyzabcdef", 0, REG_NOMATCH); + test_exec("xyzabcdef", REG_NOTBOL, REG_NOMATCH); + test_exec("\nabcdef", 0, REG_OK, 1, 4, END); + test_exec("\nabcdef", REG_NOTBOL, 0, 1, 4, END); + test_comp("abc$", REG_NEWLINE, 0); + test_exec("defabc", 0, REG_OK, 3, 6, END); + test_exec("defabc", REG_NOTEOL, REG_NOMATCH); + test_exec("defabcxyz", 0, REG_NOMATCH); + test_exec("defabcxyz", REG_NOTEOL, REG_NOMATCH); + test_exec("defabc\n", 0, REG_OK, 3, 6, END); + test_exec("defabc\n", REG_NOTEOL, 0, 3, 6, END); + test_comp("^abc$", REG_NEWLINE, 0); + test_exec("abc", 0, REG_OK, 0, 3, END); + test_exec("abc", REG_NOTBOL, REG_NOMATCH); + test_exec("abc", REG_NOTEOL, REG_NOMATCH); + test_exec("abc", REG_NOTBOL | REG_NOTEOL, REG_NOMATCH); + test_exec("\nabc\n", 0, REG_OK, 1, 4, END); + test_exec("defabc\n", 0, REG_NOMATCH); + test_exec("\nabcdef", 0, REG_NOMATCH); + test_exec("abcdef", 0, REG_NOMATCH); + test_exec("abcdef", REG_NOTBOL, REG_NOMATCH); + test_exec("defabc", 0, REG_NOMATCH); + test_exec("defabc", REG_NOTEOL, REG_NOMATCH); + test_exec("abc\ndef", 0, REG_OK, 0, 3, END); + test_exec("abc\ndef", REG_NOTBOL, REG_NOMATCH); + test_exec("abc\ndef", REG_NOTEOL, 0, 0, 3, END); + test_exec("abc\ndef", REG_NOTBOL | REG_NOTEOL, REG_NOMATCH); + test_exec("def\nabc", 0, REG_OK, 4, 7, END); + test_exec("def\nabc", REG_NOTBOL, 0, 4, 7, END); + test_exec("def\nabc", REG_NOTEOL, REG_NOMATCH); + test_exec("def\nabc", REG_NOTBOL | REG_NOTEOL, REG_NOMATCH); + + /* With BRE syntax, `^' has a special meaning only at the beginning of the + RE or the beginning of a parenthesized subexpression. 
*/ + test_comp("a\\{0,1\\}^bc", 0, 0); + test_exec("bc", 0, REG_NOMATCH); + test_exec("^bc", 0, REG_OK, 0, 3, END); + test_exec("abc", 0, REG_NOMATCH); + test_exec("a^bc", 0, REG_OK, 0, 4, END); + test_comp("a\\{0,1\\}\\(^bc\\)", 0, 0); + test_exec("bc", 0, REG_OK, 0, 2, 0, 2, END); + test_exec("^bc", 0, REG_NOMATCH); + test_exec("abc", 0, REG_NOMATCH); + test_exec("a^bc", 0, REG_NOMATCH); + test_comp("(^a", 0, 0); + test_exec("(^a", 0, REG_OK, 0, 3, END); + + /* With BRE syntax, `$' has a special meaning only at the end of the + RE or the end of a parenthesized subexpression. */ + test_comp("ab$c\\{0,1\\}", 0, 0); + test_exec("ab", 0, REG_NOMATCH); + test_exec("ab$", 0, REG_OK, 0, 3, END); + test_exec("abc", 0, REG_NOMATCH); + test_exec("ab$c", 0, REG_OK, 0, 4, END); + test_comp("\\(ab$\\)c\\{0,1\\}", 0, 0); + test_exec("ab", 0, REG_OK, 0, 2, 0, 2, END); + test_exec("ab$", 0, REG_NOMATCH); + test_exec("abc", 0, REG_NOMATCH); + test_exec("ab$c", 0, REG_NOMATCH); + test_comp("a$)", 0, 0); + test_exec("a$)", 0, REG_OK, 0, 3, END); + + /* Miscellaneous tests for `^' and `$'. */ + test_comp("foo^$", REG_EXTENDED, 0); + test_exec("foo", 0, REG_NOMATCH); + test_comp("x$\n^y", REG_EXTENDED | REG_NEWLINE, 0); + test_exec("foo\nybarx\nyes\n", 0, REG_OK, 8, 11, END); + test_comp("^$", 0, 0); + test_exec("x", 0, REG_NOMATCH); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("\n", 0, REG_NOMATCH); + test_comp("^$", REG_NEWLINE, 0); + test_exec("x", 0, REG_NOMATCH); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("\n", 0, REG_OK, 0, 0, END); + + /* REG_NEWLINE causes `.' not to match newlines. */ + test_comp(".*", 0, 0); + test_exec("ab\ncd", 0, REG_OK, 0, 5, END); + test_comp(".*", REG_NEWLINE, 0); + test_exec("ab\ncd", 0, REG_OK, 0, 2, END); + + /* + * Tests for nonstandard syntax extensions. + */ + + /* Zero width assertions. 
*/ + test_comp("\\", REG_EXTENDED, 0); + test_exec("axx xaa", 0, REG_OK, 2, 3, END); + test_exec("aax", 0, REG_OK, 2, 3, END); + test_comp("\\bx", REG_EXTENDED, 0); + test_exec("axx xaa", 0, REG_OK, 4, 5, END); + test_exec("aax", 0, REG_NOMATCH); + test_exec("xax", 0, REG_OK, 0, 1, END); + test_comp("x\\b", REG_EXTENDED, 0); + test_exec("axx xaa", 0, REG_OK, 2, 3, END); + test_exec("aax", 0, REG_OK, 2, 3, END); + test_exec("xaa", 0, REG_NOMATCH); + test_comp("\\Bx", REG_EXTENDED, 0); + test_exec("aax xxa", 0, REG_OK, 2, 3, END); + test_comp("\\Bx\\b", REG_EXTENDED, 0); + test_exec("aax xxx", 0, REG_OK, 2, 3, END); + test_comp("\\<.", REG_EXTENDED, 0); + test_exec(";xaa", 0, REG_OK, 1, 2, END); + + /* Shorthands for character classes. */ + test_comp("\\w+", REG_EXTENDED, 0); +#ifdef SRC_IN_ISO_8859_1 + test_exec(",.(a23_Nt-o)", 0, REG_OK, 3, 9, END); +#else +#ifdef SRC_IN_UTF_8 + /* iconv -f ISO-8859-1 -t UTF-8 file_with_lines_above > www_utf_8 */ + test_exec(",.(a23_Nt-öo)", 0, REG_OK, 3, 9, END); +#else + unsigned char str_000[] = { + ',','.','(','a','2','3','_','N','t','-',0xF6,'o',0x00 + }; + test_exec((char const *)str_000, 0, REG_OK, 3, 9, END); +#endif +#endif + test_comp("\\d+", REG_EXTENDED, 0); + test_exec("uR120_4=v4", 0, REG_OK, 2, 5, END); + test_comp("\\D+", REG_EXTENDED, 0); + test_exec("120d_=vA4s", 0, REG_OK, 3, 8, END); + + /* Quoted special characters. */ + test_comp("\\t", REG_EXTENDED, 0); + test_comp("\\e", REG_EXTENDED, 0); + + /* Test the \x1B and \x{263a} extensions for specifying 8 bit and wide + characters in hexadecimal. 
*/ + test_comp("\\x41", REG_EXTENDED, 0); + test_exec("ABC", 0, REG_OK, 0, 1, END); + test_comp("\\x5", REG_EXTENDED, 0); + test_exec("\005", 0, REG_OK, 0, 1, END); + test_comp("\\x5r", REG_EXTENDED, 0); + test_exec("\005r", 0, REG_OK, 0, 2, END); + test_comp("\\x", REG_EXTENDED, 0); + test_nexec("\000", 1, 0, REG_OK, 0, 1, END); + test_comp("\\xr", REG_EXTENDED, 0); + test_nexec("\000r", 2, 0, REG_OK, 0, 2, END); + test_comp("\\x{41}", REG_EXTENDED, 0); + test_exec("ABC", 0, REG_OK, 0, 1, END); + test_comp("\\x{5}", REG_EXTENDED, 0); + test_exec("\005", 0, REG_OK, 0, 1, END); + test_comp("\\x{5}r", REG_EXTENDED, 0); + test_exec("\005r", 0, REG_OK, 0, 2, END); + test_comp("\\x{}", REG_EXTENDED, 0); + test_nexec("\000", 1, 0, REG_OK, 0, 1, END); + test_comp("\\x{}r", REG_EXTENDED, 0); + test_nexec("\000r", 2, 0, REG_OK, 0, 2, END); + test_comp("\\x{00000000}", REG_EXTENDED, 0); + test_comp("\\x{000000000}", REG_EXTENDED, REG_EBRACE); + + /* Tests for (?inrU-inrU) and (?inrU-inrU:) */ + test_comp("foo(?i)bar", REG_EXTENDED, 0); + test_exec("fooBaR", 0, REG_OK, 0, 6, END); + test_comp("foo(?i)bar|zap", REG_EXTENDED, 0); + test_exec("fooBaR", 0, REG_OK, 0, 6, END); + test_exec("foozap", 0, REG_OK, 0, 6, END); + test_exec("foozAp", 0, REG_OK, 0, 6, END); + test_exec("zap", 0, REG_NOMATCH); + test_comp("foo(?-i:zap)zot", REG_EXTENDED | REG_ICASE, 0); + test_exec("FoOzapZOt", 0, REG_OK, 0, 9, END); + test_exec("FoOzApZOt", 0, REG_NOMATCH); + test_comp("foo(?i:bar|zap)", REG_EXTENDED, 0); + test_exec("foozap", 0, REG_OK, 0, 6, END); + test_exec("foobar", 0, REG_OK, 0, 6, END); + test_exec("foobAr", 0, REG_OK, 0, 6, END); + test_exec("fooZaP", 0, REG_OK, 0, 6, END); + test_comp("foo(?U:o*)(o*)", REG_EXTENDED, 0); + test_exec("foooo", 0, REG_OK, 0, 5, 3, 5, END); + + /* Test comment syntax. */ + test_comp("foo(?# This here is a comment. )bar", REG_EXTENDED, 0); + test_exec("foobar", 0, REG_OK, 0, 6, END); + + /* Tests for \Q and \E. 
*/ + test_comp("\\((\\Q)?:\\<[^$\\E)", REG_EXTENDED, 0); + test_exec("()?:\\<[^$", 0, REG_OK, 0, 9, 1, 9, END); + test_comp("\\Qabc\\E.*", REG_EXTENDED, 0); + test_exec("abcdef", 0, REG_OK, 0, 6, END); + test_comp("\\Qabc\\E.*|foo", REG_EXTENDED, 0); + test_exec("parabc123wxyz", 0, REG_OK, 3, 13, END); + test_exec("fooabc123wxyz", 0, REG_OK, 0, 3, END); + + /* + * Test integer parser used for bounded repetitions. + */ + + test_comp("a{9223372036854775808,}", REG_EXTENDED, REG_BADMAX); + test_comp("a{9223372036854775808}", REG_EXTENDED, REG_BADMAX); + test_comp("a{9223372036854775807,}", REG_EXTENDED, REG_BADMAX); + test_comp("a{9223372036854775807}", REG_EXTENDED, REG_BADMAX); + test_comp("a{2147483648,}", REG_EXTENDED, REG_BADMAX); + test_comp("a{2147483648}", REG_EXTENDED, REG_BADMAX); + test_comp("a{2147483647,}", REG_EXTENDED, REG_BADMAX); + test_comp("a{2147483647}", REG_EXTENDED, REG_BADMAX); + test_comp("a{32768,}", REG_EXTENDED, REG_BADMAX); + test_comp("a{32768}", REG_EXTENDED, REG_BADMAX); + test_comp("a{32767,}", REG_EXTENDED, REG_BADMAX); + test_comp("a{32767}", REG_EXTENDED, REG_BADMAX); + test_comp("a{256,}", REG_EXTENDED, REG_BADMAX); + test_comp("a{256}", REG_EXTENDED, REG_BADMAX); + test_comp("a{255,}", REG_EXTENDED, REG_OK); + test_comp("a{255}", REG_EXTENDED, REG_OK); + + /* + * Test bounded repetitions.
+ */ + + test_comp("a{0,0}", REG_EXTENDED, REG_OK); + test_exec("aaa", 0, REG_OK, 0, 0, END); + test_comp("a{0,1}", REG_EXTENDED, REG_OK); + test_exec("aaa", 0, REG_OK, 0, 1, END); + test_comp("a{1,1}", REG_EXTENDED, REG_OK); + test_exec("aaa", 0, REG_OK, 0, 1, END); + test_comp("a{1,3}", REG_EXTENDED, REG_OK); + test_exec("xaaaaa", 0, REG_OK, 1, 4, END); + test_comp("a{0,3}", REG_EXTENDED, REG_OK); + test_exec("aaaaa", 0, REG_OK, 0, 3, END); + test_comp("a{0,}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("aa", 0, REG_OK, 0, 2, END); + test_exec("aaa", 0, REG_OK, 0, 3, END); + test_comp("a{1,}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_NOMATCH); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("aa", 0, REG_OK, 0, 2, END); + test_exec("aaa", 0, REG_OK, 0, 3, END); + test_comp("a{2,}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_NOMATCH); + test_exec("a", 0, REG_NOMATCH); + test_exec("aa", 0, REG_OK, 0, 2, END); + test_exec("aaa", 0, REG_OK, 0, 3, END); + test_comp("a{3,}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_NOMATCH); + test_exec("a", 0, REG_NOMATCH); + test_exec("aa", 0, REG_NOMATCH); + test_exec("aaa", 0, REG_OK, 0, 3, END); + test_exec("aaaa", 0, REG_OK, 0, 4, END); + test_exec("aaaaa", 0, REG_OK, 0, 5, END); + test_exec("aaaaaa", 0, REG_OK, 0, 6, END); + test_exec("aaaaaaa", 0, REG_OK, 0, 7, END); + test_comp("a{,}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("aaa", 0, REG_OK, 0, 3, END); + test_comp("a{,0}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("aaa", 0, REG_OK, 0, 0, END); + test_comp("a{,1}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("aa", 0, REG_OK, 0, 1, END); + test_comp("a{,2}", REG_EXTENDED, REG_OK); + test_exec("", 0, REG_OK, 0, 0, END); + test_exec("a", 0, REG_OK, 0, 1, END); + test_exec("aa", 0, REG_OK, 0, 2, END); + 
test_exec("aaa", 0, REG_OK, 0, 2, END); + + test_comp("a{5,10}", REG_EXTENDED, REG_OK); + test_comp("a{6,6}", REG_EXTENDED, REG_OK); + test_exec("aaaaaaaaaaaa", 0, REG_OK, 0, 6, END); + test_exec("xxaaaaaaaaaaaa", 0, REG_OK, 2, 8, END); + test_exec("xxaaaaa", 0, REG_NOMATCH); + test_comp("a{5,6}", REG_EXTENDED, REG_OK); + test_exec("aaaaaaaaaaaa", 0, REG_OK, 0, 6, END); + test_exec("xxaaaaaaaaaaaa", 0, REG_OK, 2, 8, END); + test_exec("xxaaaaa", 0, REG_OK, 2, 7, END); + test_exec("xxaaaa", 0, REG_NOMATCH); + + /* Trickier ones... */ + test_comp("([ab]{5,10})*b", REG_EXTENDED, REG_OK); + test_exec("bbbbbabaaaaab", 0, REG_OK, 0, 13, 5, 12, END); + test_exec("bbbbbbaaaaab", 0, REG_OK, 0, 12, 5, 11, END); + test_exec("bbbbbbaaaab", 0, REG_OK, 0, 11, 0, 10, END); + test_exec("bbbbbbaaab", 0, REG_OK, 0, 10, 0, 9, END); + test_exec("bbbbbbaab", 0, REG_OK, 0, 9, 0, 8, END); + test_exec("bbbbbbab", 0, REG_OK, 0, 8, 0, 7, END); + + test_comp("([ab]*)(ab[ab]{5,10})ba", REG_EXTENDED, REG_OK); + test_exec("abbabbbabaabbbbbbbbbbbbbabaaaabab", 0, REG_OK, + 0, 10, 0, 0, 0, 8, END); + test_exec("abbabbbabaabbbbbbbbbbbbabaaaaabab", 0, REG_OK, + 0, 32, 0, 23, 23, 30, END); + test_exec("abbabbbabaabbbbbbbbbbbbabaaaabab", 0, REG_OK, + 0, 24, 0, 10, 10, 22, END); + test_exec("abbabbbabaabbbbbbbbbbbba", 0, REG_OK, + 0, 24, 0, 10, 10, 22, END); + + test_comp("^((a{1,2})?x)*y", REG_EXTENDED | REG_NOSUB, REG_OK); + test_exec("y", 0, REG_OK, END); + test_exec("xy", 0, REG_OK, END); + test_exec("axy", 0, REG_OK, END); + test_exec("aaxy", 0, REG_OK, END); + test_exec("aaaxy", 0, REG_NOMATCH, END); + + /* Test repeating something that has submatches inside. 
*/ + test_comp("(a){0,5}", REG_EXTENDED, 0); + test_exec("", 0, REG_OK, 0, 0, -1, -1, END); + test_exec("a", 0, REG_OK, 0, 1, 0, 1, END); + test_exec("aa", 0, REG_OK, 0, 2, 1, 2, END); + test_exec("aaa", 0, REG_OK, 0, 3, 2, 3, END); + test_exec("aaaa", 0, REG_OK, 0, 4, 3, 4, END); + test_exec("aaaaa", 0, REG_OK, 0, 5, 4, 5, END); + test_exec("aaaaaa", 0, REG_OK, 0, 5, 4, 5, END); + + test_comp("(a){2,3}", REG_EXTENDED, 0); + test_exec("", 0, REG_NOMATCH); + test_exec("a", 0, REG_NOMATCH); + test_exec("aa", 0, REG_OK, 0, 2, 1, 2, END); + test_exec("aaa", 0, REG_OK, 0, 3, 2, 3, END); + test_exec("aaaa", 0, REG_OK, 0, 3, 2, 3, END); + + test_comp("\\(a\\)\\{4\\}", 0, 0); + test_exec("aaaa", 0, REG_OK, 0, 4, 3, 4, END); + + test_comp("\\(a*\\)\\{2\\}", 0, 0); + test_exec("a", 0, REG_OK, 0, 1, 1, 1, END); + + test_comp("((..)|(.)){2}", REG_EXTENDED, 0); + test_exec("aa", 0, REG_OK, 0, 2, 1, 2, -1, -1, 1, 2, END); + + /* Nested repeats. */ + test_comp("(.){2}{3}", REG_EXTENDED, 0); + test_exec("xxxxx", 0, REG_NOMATCH); + test_exec("xxxxxx", 0, REG_OK, 0, 6, 5, 6, END); + test_comp("(..){2}{3}", REG_EXTENDED, 0); + test_exec("xxxxxxxxxxx", 0, REG_NOMATCH); + test_exec("xxxxxxxxxxxx", 0, REG_OK, 0, 12, 10, 12, END); + test_comp("((..){2}.){3}", REG_EXTENDED, 0); + test_exec("xxxxxxxxxxxxxx", 0, REG_NOMATCH); + test_exec("xxxxxxxxxxxxxxx", 0, REG_OK, 0, 15, 10, 15, 12, 14, END); + test_comp("((..){1,2}.){3}", REG_EXTENDED, 0); + test_exec("xxxxxxxx", 0, REG_NOMATCH); + test_exec("xxxxxxxxx", 0, REG_OK, 0, 9, 6, 9, 6, 8, END); + test_exec("xxxxxxxxxx", 0, REG_OK, 0, 9, 6, 9, 6, 8, END); + test_exec("xxxxxxxxxxx", 0, REG_OK, 0, 11, 8, 11, 8, 10, END); + test_comp("a{2}{2}x", REG_EXTENDED, 0); + test_exec("", 0, REG_NOMATCH); + test_exec("x", 0, REG_NOMATCH); + test_exec("ax", 0, REG_NOMATCH); + test_exec("aax", 0, REG_NOMATCH); + test_exec("aaax", 0, REG_NOMATCH); + test_exec("aaaax", 0, REG_OK, 0, 5, END); + test_exec("aaaaax", 0, REG_OK, 1, 6, END); + test_exec("aaaaaax", 
0, REG_OK, 2, 7, END); + test_exec("aaaaaaax", 0, REG_OK, 3, 8, END); + test_exec("aaaaaaaax", 0, REG_OK, 4, 9, END); + + /* Repeats with iterations inside. */ + test_comp("([a-z]+){2,5}", REG_EXTENDED, 0); + test_exec("a\n", 0, REG_NOMATCH); + test_exec("aa\n", 0, REG_OK, 0, 2, 1, 2, END); + + /* Multiple repeats in one regexp. */ + test_comp("a{3}b{3}", REG_EXTENDED, 0); + test_exec("aaabbb", 0, REG_OK, 0, 6, END); + test_exec("aaabbbb", 0, REG_OK, 0, 6, END); + test_exec("aaaabbb", 0, REG_OK, 1, 7, END); + test_exec("aabbb", 0, REG_NOMATCH); + test_exec("aaabb", 0, REG_NOMATCH); + + /* Test that different types of repetitions work correctly when used + in the same regexp. */ + test_comp("a{2}{2}xb+xc*xd?x", REG_EXTENDED, 0); + test_exec("aaaaxbxcxdx", 0, REG_OK, 0, 11, END); + test_exec("aaaxbxcxdx", 0, REG_NOMATCH); + test_exec("aabxcxdx", 0, REG_NOMATCH); + test_exec("aaaacxdx", 0, REG_NOMATCH); + test_exec("aaaaxbdx", 0, REG_NOMATCH); + test_comp("^!packet [0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3} [0-9]+", + REG_EXTENDED, 0); + test_exec("!packet 10.0.2.4 12765 ei voittoa", 0, REG_OK, 0, 22, END); + + /* + * Back referencing tests. 
+ */ + test_comp("([a-z]*) \\1", REG_EXTENDED, 0); + test_exec("foobar foobar", 0, REG_OK, 0, 13, 0, 6, END); + + /* Searching for a leftmost longest square (repeated string) */ + test_comp("(.*)\\1", REG_EXTENDED, 0); + test_exec("foobarfoobar", 0, REG_OK, 0, 12, 0, 6, END); + + test_comp("a(b)*c\\1", REG_EXTENDED, 0); + test_exec("acb", 0, REG_OK, 0, 2, -1, -1, END); + test_exec("abbcbbb", 0, REG_OK, 0, 5, 2, 3, END); + test_exec("abbdbd", 0, REG_NOMATCH); + + test_comp("([a-c]*)\\1", REG_EXTENDED, 0); + test_exec("abcacdef", 0, REG_OK, 0, 0, 0, 0, END); + test_exec("abcabcabcd", 0, REG_OK, 0, 6, 0, 3, END); + test_comp("(.{1,3})\\1", REG_EXTENDED, 0); + test_exec("foo", 0, REG_OK, 1, 3, 1, 2, END); + + test_comp("\\(a*\\)*\\(x\\)\\(\\1\\)", 0, 0); + test_exec("x", 0, REG_OK, 0, 1, 0, 0, 0, 1, 1, 1, END); +#if KNOWN_BUG + test_exec("ax", 0, REG_OK, 0, 2, 1, 1, 1, 2, 2, 2, END); +#endif + + test_comp("(a)\\1{1,2}", REG_EXTENDED, 0); + test_exec("aabc", 0, REG_OK, 0, 2, 0, 1, END); + + test_comp("((.*)\\1)+", REG_EXTENDED, 0); + test_exec("aa", 0, REG_OK, 0, 2, 0, 2, 0, 1, END); + +#if KNOWN_BUG + test_comp("()(\\1\\1)*", REG_EXTENDED, 0); + test_exec("", 0, REG_OK, 0, 0, 0, 0, 0, 0, END); +#endif + + /* Check that back references work with REG_NOSUB. */ + test_comp("(o)\\1", REG_EXTENDED | REG_NOSUB, 0); + test_exec("foobar", 0, REG_OK, END); + test_comp("(o)\\1", REG_EXTENDED, 0); + test_exec("foobar", 0, REG_OK, 1, 3, 1, 2, END); + test_comp("(o)\\1", REG_EXTENDED, 0); + test_exec("fobar", 0, REG_NOMATCH); + + test_comp("\\1foo", REG_EXTENDED, REG_ESUBREG); + test_comp("\\1foo(bar)", REG_EXTENDED, 0); + + /* Back reference with zero-width assertion. */ + test_comp("(.)\\1$", REG_EXTENDED, 0); + test_exec("foox", 0, REG_NOMATCH); + test_exec("foo", 0, REG_OK, 1, 3, 1, 2, END); + + /* Back references together with {}. 
*/ + test_comp("([0-9]{5})\\1", REG_EXTENDED, 0); + test_exec("12345", 0, REG_NOMATCH); + test_exec("1234512345", 0, REG_OK, 0, 10, 0, 5, END); + test_comp("([0-9]{4})\\1", REG_EXTENDED, 0); + test_exec("1234", 0, REG_NOMATCH); + test_exec("12341234", 0, REG_OK, 0, 8, 0, 4, END); + + /* + * Test minimal repetitions (non-greedy repetitions) + */ + avoid_eflags = REG_BACKTRACKING_MATCHER | REG_APPROX_MATCHER; + + /* Basic .*/ + test_comp(".*?", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 0, END); + test_comp(".+?", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 1, END); + test_comp(".??", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 0, END); + test_comp(".{2,5}?", REG_EXTENDED, 0); + test_exec("abcd", 0, REG_OK, 0, 2, END); + + /* More complicated. */ + test_comp("(.*?)", REG_EXTENDED, 0); + test_exec("text1text2", 0, REG_OK, 0, 12, 3, 8, END); + test_comp("a(.*?)(foo|bar|zap)", REG_EXTENDED, 0); + test_exec("hubba wooga-booga zabar gafoo wazap", 0, REG_OK, + 4, 23, 5, 20, 20, 23, END); + + /* Test REG_UNGREEDY. */ + test_comp(".*", REG_EXTENDED | REG_UNGREEDY, 0); + test_exec("abcd", 0, REG_OK, 0, 0, END); + test_comp(".*?", REG_EXTENDED | REG_UNGREEDY, 0); + test_exec("abcd", 0, REG_OK, 0, 4, END); + + avoid_eflags = 0; + + + /* + * Error reporting tests. 
+ */ + + test_comp("\\", REG_EXTENDED, REG_EESCAPE); + test_comp("\\\\", REG_EXTENDED, REG_OK); + test_exec("\\", 0, REG_OK, 0, 1, END); + test_comp("(", REG_EXTENDED, REG_EPAREN); + test_comp("(aaa", REG_EXTENDED, REG_EPAREN); + test_comp(")", REG_EXTENDED, REG_OK); + test_exec(")", 0, REG_OK, 0, 1, END); + test_comp("a{1", REG_EXTENDED, REG_EBRACE); + test_comp("a{1,x}", REG_EXTENDED, REG_BADBR); + test_comp("a{1x}", REG_EXTENDED, REG_BADBR); + test_comp("a{1,0}", REG_EXTENDED, REG_BADBR); + test_comp("a{x}", REG_EXTENDED, REG_BADBR); + test_comp("a{}", REG_EXTENDED, REG_BADBR); + + + test_comp("\\", 0, REG_EESCAPE); + test_comp("\\(", 0, REG_EPAREN); + test_comp("\\)", 0, REG_EPAREN); + test_comp("a\\{1", 0, REG_EBRACE); + test_comp("a\\{1,x\\}", 0, REG_BADBR); + test_comp("a\\{1x\\}", 0, REG_BADBR); + test_comp("a\\{1,0\\}", 0, REG_BADBR); + test_comp("a\\{x\\}", 0, REG_BADBR); + test_comp("a\\{\\}", 0, REG_BADBR); + test_comp("a\\{1,256\\}", 0, REG_BADMAX); + + + test_comp(NULL, REG_BASIC, REG_OK); + test_comp(NULL, REG_EXTENDED, REG_OK); + + + /* + * Internationalization tests. + */ + + /* This same test with the correct locale is below. + TBR: This is a guess for the source encoding, see comments below after the locale is set to a Japanese locale. */ +#ifdef SRC_IN_EUC_JP + test_comp("+", REG_EXTENDED, 0); + test_exec("ξޤϡ", + 0, REG_OK, 10, 13, END); +#else +#ifdef SRC_IN_UTF_8 + /* iconv -f EUC_JP -t UTF-8 file_with_lines_above > zzz_utf_8 + This may be incorrect because the match results might be incorrect for UTF-8, I (TBR) just don't know enough to be certain. + It compiles and runs successfully on my desktop with the C.UTF-8 locale. */ + test_comp("機+", REG_EXTENDED, 0); + test_exec("この賞は、機・利便性・セキ", + 0, REG_OK, 15, 18, END); +#else + /* Represent the test strings as a sequence of bytes so we don't run afoul of the compiler's expected source-charset. 
*/ + unsigned char str_001[] = { + 0xB5,0xA1,'+',0x00 + }; + unsigned char str_002[] = { + 0xA4,0xB3,0xA4,0xCE,0xBE,0xDE,0xA4,0xCF,0xA1,0xA2,0xB5,0xA1,0xA1,0xA6,0xCD,0xF8,0xCA,0xD8,0xC0,0xAD,0xA1,0xA6,0xA5,0xBB,0xA5,0xAD,0x00 + }; + test_comp((char const *)str_001, REG_EXTENDED, 0); + test_exec((char const *)str_002, 0, REG_OK, 10, 13, END); +#endif +#endif + +#if !defined(WIN32) && !defined(__OpenBSD__) + if (setlocale(LC_CTYPE, "en_US.ISO-8859-1") != NULL || + setlocale(LC_CTYPE, "en_US.ISO8859-1") != NULL) + { + fprintf(outf, "\nTesting LC_CTYPE en_US.ISO-8859-1\n"); +#ifdef SRC_IN_ISO_8859_1 + test_comp("aBCdeFghiJKlmnoPQRstuvWXyZ", REG_ICASE, 0); + test_exec("abCDefGhiJKlmNoPqRStuVwXyz", 0, REG_OK, 0, 29, END); +#else +#ifdef SRC_IN_UTF_8 + /* iconv -f ISO-8859-1 -t UTF-8 file_with_lines_above > yyy_utf_8 */ + /* This fails with no match on freebsd, but succeeds in linux. */ + test_comp("aBCdeFghiJKlmnoPQRstuvWXyZåäö", REG_ICASE, 0); + test_exec("abCDefGhiJKlmNoPqRStuVwXyzÅÄÖ", 0, REG_OK, 0, 29, END); +#else + /* Represent the test strings as a sequence of bytes so we don't run afoul of the compiler's expected source-charset. */ + unsigned char str_003[] = { + 'a','B','C','d','e','F','g','h','i','J','K','l','m','n','o','P','Q','R','s','t','u','v','W','X','y','Z',0xE5,0xE4,0xF6,0x00 + }; + unsigned char str_004[] = { + 'a','b','C','D','e','f','G','h','i','J','K','l','m','N','o','P','q','R','S','t','u','V','w','X','y','z',0xC5,0xC4,0xD6,0x00 + }; + test_comp((char const *)str_003, REG_ICASE, 0); + test_exec((char const *)str_004, 0, REG_OK, 0, 29, END); +#endif +#endif + } + +#ifdef TRE_MULTIBYTE + if (setlocale(LC_CTYPE, "ja_JP.eucjp") != NULL || + setlocale(LC_CTYPE, "ja_JP.eucJP") != NULL) + { + fprintf(outf, "\nTesting LC_CTYPE ja_JP.eucjp\n"); + /* I tried to make a test where implementations not aware of multibyte + character sets will fail. I have no idea what the japanese text here + means, I took it from http://www.ipsec.co.jp/. 
*/ + /* TBR 2023/03/22: iconv has (at least) the following encoding names for Japanese: + EUC-JIS-2004 EUC-JISX0213 + EUC-JP-MS EUCJP-MS EUCJP-OPEN EUCJP-WIN EUCJPMS + EUC-JP CSEUCPKDFMTJAPANESE EUCJP IBM-EUCJP + ISO-2022-JP-1 ISO2022-JP1 + ISO-2022-JP-2 CSISO2022JP2 ISO2022-JP2 ISO-2022-JP-2004 ISO-2022-JP-3 ISO2022-JP2004 ISO2022-JP3 + ISO-2022-JP CSISO2022JP ISO2022-JP + Both iconv arguments of EUC-JP and EUC-JP-MS produced the converted strings below, + all the others I tried resulted in invalid characters. So guess at EUC-JP. + If anyone knows what the encoding actually was, feel free to let me know at tbr at acm dot org :). */ +#ifdef SRC_IN_EUC_JP + test_comp("+", REG_EXTENDED, 0); + test_exec("ξޤϡ", 0, REG_OK, 10, 12, END); +#else +#ifdef SRC_IN_UTF_8 + /* iconv -f EUC_JP -t UTF-8 file_with_lines_above > zzz_utf_8 + This may fail because the match results might be incorrect for UTF-8, I (TBR) just don't know enough to be certain. + It compiles and runs successfully on my desktop with the C.UTF-8 locale. */ + test_comp("機+", REG_EXTENDED, 0); + test_exec("この賞は、機・利便性・セキ", 0, REG_OK, 10, 12, END); +#else + /* Represent the test strings as a sequence of bytes so we don't run afoul of the compiler's expected source-charset. */ + /* This test uses the same strings (str_001 and str_002) as above, now with a Japanese locale. + NOTE THE DIFFERENCE IN MATCH RESULTS - (10,13) earlier with the default locale, and (10,12) here with the Japanese locale. 
*/ + test_comp((char const *)str_001, REG_EXTENDED, 0); + test_exec((char const *)str_002, 0, REG_OK, 10, 12, END); +#endif +#endif + test_comp("a", REG_EXTENDED, 0); + test_nexec("foo\000bar", 7, 0, REG_OK, 5, 6, END); + test_comp("c$", REG_EXTENDED, 0); + test_exec("abc", 0, REG_OK, 2, 3, END); + } + else + { + fprintf(outf, "\nTRE_MULTIBYTE enabled, but skipping LC_CTYPE ja_JP.eucJP (locale unavailable)\n"); + } +#endif /* TRE_MULTIBYTE */ +#endif + + tre_regfree(&reobj); + + fprintf(outf, "\n"); + if (comp_errors || exec_errors) + fprintf(outf, "%d (%d + %d) out of %d tests FAILED!\n", + comp_errors + exec_errors, comp_errors, exec_errors, + comp_tests + exec_tests); + else + fprintf(outf, "All %d tests passed.\n", comp_tests + exec_tests); + + +#ifdef MALLOC_DEBUGGING + if (xmalloc_dump_leaks()) + return 1; +#endif /* MALLOC_DEBUGGING */ + + return comp_errors || exec_errors; +} + +/* EOF */ diff --git a/deps/tre/tests/test-literal-opt.c b/deps/tre/tests/test-literal-opt.c new file mode 100644 index 000000000..62853e07d --- /dev/null +++ b/deps/tre/tests/test-literal-opt.c @@ -0,0 +1,303 @@ +/* + test-literal-opt.c - Validate TRE literal optimization against the + generic matcher. + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. 
+
+*/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "tre-internal.h"
+
+/* Number of match slots requested from every execution call. */
+#define PMATCH_SLOTS 4
+/* expected_rc wildcard: the return code only has to be the same for the
+   optimized and the generic run. */
+#define RC_ANY (-9999)
+
+/* One scenario: pattern, subject, flags and the expected outcome. */
+typedef struct {
+  const char *name;                     /* label used in diagnostics */
+  const char *pattern;                  /* pattern bytes */
+  size_t pattern_len;                   /* pattern length in bytes */
+  int cflags;                           /* passed to tre_regncompb() */
+  const char *string;                   /* subject bytes */
+  size_t string_len;                    /* subject length in bytes */
+  int eflags;                           /* passed to tre_regnexecb() */
+  int expected_rc;                      /* required return code, or RC_ANY */
+  tre_literal_opt_mode_t expected_mode; /* required optimizer mode */
+} litopt_case_t;
+
+/* Fill COUNT slots of PMATCH with the marker offsets 111/222, which differ
+   from the -1/-1 pair that pmatch_cleared() checks for. */
+static void
+init_pmatch(regmatch_t pmatch[], size_t count)
+{
+  size_t i;
+
+  for (i = 0; i < count; i++)
+    {
+      pmatch[i].rm_so = 111;
+      pmatch[i].rm_eo = 222;
+    }
+}
+
+/* Return 1 if the first COUNT slots of A and B hold equal offsets, else 0. */
+static int
+same_pmatch(const regmatch_t a[], const regmatch_t b[], size_t count)
+{
+  size_t i;
+
+  for (i = 0; i < count; i++)
+    if (a[i].rm_so != b[i].rm_so || a[i].rm_eo != b[i].rm_eo)
+      return 0;
+  return 1;
+}
+
+/* Return 1 if each of the first COUNT slots is (-1, -1), else 0. */
+static int
+pmatch_cleared(const regmatch_t pmatch[], size_t count)
+{
+  size_t i;
+
+  for (i = 0; i < count; i++)
+    if (pmatch[i].rm_so != -1 || pmatch[i].rm_eo != -1)
+      return 0;
+  return 1;
+}
+
+/* Compile TC's pattern, check that the expected literal-optimization mode
+   was selected, then execute the subject twice: once as compiled and once
+   with the mode forced to TRE_LITERAL_OPT_NONE.  Both runs must produce the
+   same return code and the same pmatch contents.  Diagnostics are written
+   to stderr.  Returns 0 on success and 1 on any failure. */
+static int
+run_case(const litopt_case_t *tc)
+{
+  regex_t preg;
+  tre_tnfa_t *tnfa;
+  regmatch_t fast[PMATCH_SLOTS], slow[PMATCH_SLOTS];
+  tre_literal_opt_mode_t saved_mode;
+  char errbuf[256];
+  int errcode, fast_rc, slow_rc;
+
+  memset(&preg, 0, sizeof(preg));
+  errcode = tre_regncompb(&preg, tc->pattern, tc->pattern_len, tc->cflags);
+  if (errcode != REG_OK)
+    {
+      tre_regerror(errcode, &preg, errbuf, sizeof(errbuf));
+      fprintf(stderr, "%s: compile failed: %s\n", tc->name, errbuf);
+      return 1;
+    }
+
+  /* The compiled TNFA records which optimizer mode was chosen. */
+  tnfa = (tre_tnfa_t *)preg.value;
+  if (tnfa->literal_opt.mode != tc->expected_mode)
+    {
+      fprintf(stderr, "%s: optimizer mode %d, expected %d\n",
+              tc->name, (int)tnfa->literal_opt.mode, (int)tc->expected_mode);
+      tre_regfree(&preg);
+      return 1;
+    }
+
+  init_pmatch(fast, PMATCH_SLOTS);
+  init_pmatch(slow, PMATCH_SLOTS);
+
+  fast_rc = tre_regnexecb(&preg, tc->string, tc->string_len,
+                          PMATCH_SLOTS, fast, tc->eflags);
+
+  /* Re-run with the optimization switched off, then restore the mode. */
+  saved_mode = tnfa->literal_opt.mode;
+  tnfa->literal_opt.mode = TRE_LITERAL_OPT_NONE;
+  slow_rc = tre_regnexecb(&preg, tc->string, tc->string_len,
+                          PMATCH_SLOTS, slow, tc->eflags);
+  tnfa->literal_opt.mode = saved_mode;
+
+  /* Both runs have to agree on the return code. */
+  if (fast_rc != slow_rc)
+    {
+      fprintf(stderr, "%s: fast rc %d, slow rc %d\n",
+              tc->name, fast_rc, slow_rc);
+      tre_regfree(&preg);
+      return 1;
+    }
+
+  /* Unless the case uses RC_ANY, the code must also be the expected one. */
+  if (tc->expected_rc != RC_ANY && fast_rc != tc->expected_rc)
+    {
+      fprintf(stderr, "%s: rc %d, expected %d\n",
+              tc->name, fast_rc, tc->expected_rc);
+      tre_regfree(&preg);
+      return 1;
+    }
+
+  /* All PMATCH_SLOTS slots of the two runs are compared. */
+  if (!same_pmatch(fast, slow, PMATCH_SLOTS))
+    {
+      fprintf(stderr, "%s: fast and slow pmatch differ\n", tc->name);
+      tre_regfree(&preg);
+      return 1;
+    }
+
+  /* With REG_NOSUB a successful match must leave all slots at (-1, -1). */
+  if ((tc->cflags & REG_NOSUB) && fast_rc == REG_OK
+      && !pmatch_cleared(fast, PMATCH_SLOTS))
+    {
+      fprintf(stderr, "%s: REG_NOSUB match did not clear pmatch\n", tc->name);
+      tre_regfree(&preg);
+      return 1;
+    }
+
+  tre_regfree(&preg);
+  return 0;
+}
+
+/* Run every case under the en_US.ISO-8859-1 locale when it is available.
+   The exit status is the number of failed cases, so 0 means that all of
+   them passed. */
+int
+main(void)
+{
+  /* Raw high-bit bytes without a terminating NUL; sizeof gives the length. */
+  static const char nonascii_pattern[] = { (char)0xc0, '|', (char)0xe0 };
+  static const char nonascii_haystack[] = { 'x', (char)0xe0, 'y' };
+  /* Entry layout: name; pattern, length; cflags; subject, length; eflags;
+     expected rc; expected mode. */
+  static const litopt_case_t cases[] = {
+    {
+      "contains basic",
+      "foo|bar|baz", sizeof("foo|bar|baz") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "xxbaryy", sizeof("xxbaryy") - 1,
+      0,
+      REG_OK,
+      TRE_LITERAL_OPT_CONTAINS
+    },
+    {
+      "contains ignores bol/eol flags",
+      "foo|bar|baz", sizeof("foo|bar|baz") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "xxbaryy", sizeof("xxbaryy") - 1,
+      REG_NOTBOL | REG_NOTEOL,
+      REG_OK,
+      TRE_LITERAL_OPT_CONTAINS
+    },
+    {
+      "prefix basic",
+      "^(foo|bar|baz)", sizeof("^(foo|bar|baz)") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "barrier", sizeof("barrier") - 1,
+      0,
+      REG_OK,
+      TRE_LITERAL_OPT_PREFIX
+    },
+    {
+      "prefix respects REG_NOTBOL",
+      "^(foo|bar|baz)", sizeof("^(foo|bar|baz)") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "barrier", sizeof("barrier") - 1,
+      REG_NOTBOL,
+      REG_NOMATCH,
+      TRE_LITERAL_OPT_PREFIX
+    },
+    {
+      "suffix basic",
+      "(foo|bar|baz)$", sizeof("(foo|bar|baz)$") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "crowbar", sizeof("crowbar") - 1,
+      0,
+      REG_OK,
+      TRE_LITERAL_OPT_SUFFIX
+    },
+    {
+      "suffix respects REG_NOTEOL",
+      "(foo|bar|baz)$", sizeof("(foo|bar|baz)$") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "crowbar", sizeof("crowbar") - 1,
+      REG_NOTEOL,
+      REG_NOMATCH,
+      TRE_LITERAL_OPT_SUFFIX
+    },
+    {
+      "exact basic",
+      "^(foo|bar|baz)$", sizeof("^(foo|bar|baz)$") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "bar", sizeof("bar") - 1,
+      0,
+      REG_OK,
+      TRE_LITERAL_OPT_EXACT
+    },
+    {
+      "exact respects REG_NOTBOL",
+      "^(foo|bar|baz)$", sizeof("^(foo|bar|baz)$") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "bar", sizeof("bar") - 1,
+      REG_NOTBOL,
+      REG_NOMATCH,
+      TRE_LITERAL_OPT_EXACT
+    },
+    {
+      "exact respects REG_NOTEOL",
+      "^(foo|bar|baz)$", sizeof("^(foo|bar|baz)$") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "bar", sizeof("bar") - 1,
+      REG_NOTEOL,
+      REG_NOMATCH,
+      TRE_LITERAL_OPT_EXACT
+    },
+    {
+      "empty alternation disables optimization",
+      "(|foo|bar)", sizeof("(|foo|bar)") - 1,
+      REG_EXTENDED | REG_NOSUB,
+      "", 0,
+      0,
+      REG_OK,
+      TRE_LITERAL_OPT_NONE
+    },
+    {
+      "inline flag disable stays generic",
+      "foo(?-i:zap)zot", sizeof("foo(?-i:zap)zot") - 1,
+      REG_EXTENDED | REG_ICASE | REG_NOSUB,
+      "FoOzApZOt", sizeof("FoOzApZOt") - 1,
+      0,
+      REG_NOMATCH,
+      TRE_LITERAL_OPT_NONE
+    },
+    {
+      "inline flag disable still matches exact scoped bytes",
+      "foo(?-i:zap)zot", sizeof("foo(?-i:zap)zot") - 1,
+      REG_EXTENDED | REG_ICASE | REG_NOSUB,
+      "FoOzapZOt", sizeof("FoOzapZOt") - 1,
+      0,
+      REG_OK,
+      TRE_LITERAL_OPT_NONE
+    },
+    {
+      "nocase non-ascii bytes stay in sync",
+      nonascii_pattern, sizeof(nonascii_pattern),
+      REG_EXTENDED | REG_ICASE | REG_NOSUB,
+      nonascii_haystack, sizeof(nonascii_haystack),
+      0,
+      RC_ANY,
+      TRE_LITERAL_OPT_CONTAINS
+    }
+  };
+  size_t i;
+  int failures = 0;
+
+  /* The result is not checked; the non-ASCII case accepts any rc (RC_ANY). */
+  setlocale(LC_CTYPE, "en_US.ISO-8859-1");
+
+  for (i = 0; i < elementsof(cases); i++)
+    failures += run_case(&cases[i]);
+
+  return failures;
+}
diff --git
a/deps/tre/tests/test-malformed-regn.c b/deps/tre/tests/test-malformed-regn.c new file mode 100644 index 000000000..7d3074a1e --- /dev/null +++ b/deps/tre/tests/test-malformed-regn.c @@ -0,0 +1,85 @@ +/* + test-malformed-regn.c - Verify exact-length edge-case regexps compile or fail + cleanly both with and without a trailing NUL byte. + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. +*/ + +#include +#include +#include + +#include "tre.h" + +typedef struct { + const char *name; + const char *pattern; + int expected_err; +} malformed_case_t; + +static int +run_case(const malformed_case_t *tc, int nul_terminated) +{ + regex_t preg; + size_t len = strlen(tc->pattern); + size_t alloc_len = len + (nul_terminated ? 1 : 0); + char *pattern = malloc(alloc_len ? alloc_len : 1); + int errcode; + + if (pattern == NULL) + { + fprintf(stderr, "%s: out of memory\n", tc->name); + return 1; + } + + if (len > 0) + memcpy(pattern, tc->pattern, len); + if (nul_terminated) + pattern[len] = '\0'; + + memset(&preg, 0, sizeof(preg)); + errcode = tre_regncompb(&preg, pattern, len, REG_EXTENDED | REG_NOSUB); + if (errcode == REG_OK) + tre_regfree(&preg); + + free(pattern); + + if (errcode != tc->expected_err) + { + char errbuf[128]; + memset(&preg, 0, sizeof(preg)); + tre_regerror(errcode, &preg, errbuf, sizeof(errbuf)); + fprintf(stderr, "%s (%s): got %d (%s), expected %d\n", + tc->name, nul_terminated ? 
"nul" : "exact", + errcode, errbuf, tc->expected_err); + return 1; + } + + return 0; +} + +int +main(void) +{ + static const malformed_case_t cases[] = { + { "open paren", "(", REG_EPAREN }, + { "open bracket", "[", REG_EBRACK }, + { "unterminated comment", "(?#", REG_BADPAT }, + { "unterminated inline flags", "(?i", REG_BADPAT }, + { "short hex escape", "\\x", REG_OK }, + { "unterminated wide hex", "\\x{", REG_EBRACE }, + { "empty wide hex", "\\x{}", REG_OK } + }; + size_t i; + + for (i = 0; i < sizeof(cases) / sizeof(*cases); i++) + { + if (run_case(&cases[i], 0)) + return 1; + if (run_case(&cases[i], 1)) + return 1; + } + + return 0; +} diff --git a/deps/tre/tests/test-str-source.c b/deps/tre/tests/test-str-source.c new file mode 100644 index 000000000..985f5b247 --- /dev/null +++ b/deps/tre/tests/test-str-source.c @@ -0,0 +1,192 @@ +/* + test-str-source.c - Sample program for using tre_reguexec() + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +/* look for getopt in order to use a -o option for output. */ +#if defined(HAVE_UNISTD_H) +#include +#elif defined(HAVE_GETOPT_H) +#include +#endif + +#include "tre-internal.h" + +static FILE *outf = NULL; + +/* Context structure for the tre_str_source wrappers. */ +typedef struct { + /* Our string. */ + const char *str; + /* Current position in the string. */ + size_t pos; +} str_handler_ctx; + +/* The get_next_char() handler. Sets `c' to the value of the next character, + and increases `pos_add' by the number of bytes read. Returns 1 if the + string has ended, 0 if there are more characters. 
*/ +static int +str_handler_get_next(tre_char_t *c, unsigned int *pos_add, void *context) +{ + str_handler_ctx *ctx = context; + unsigned char ch = ctx->str[ctx->pos]; + +#ifdef TRE_DEBUG + fprintf(outf, "str[%lu] = %d\n", (unsigned long)ctx->pos, ch); +#endif /* TRE_DEBUG */ + *c = ch; + if (ch) + ctx->pos++; + *pos_add = 1; + + return ch == '\0'; +} + +/* The rewind() handler. Resets the current position in the input string. */ +static void +str_handler_rewind(size_t pos, void *context) +{ + str_handler_ctx *ctx = context; + +#ifdef TRE_DEBUG + fprintf(outf, "rewind to %lu\n", (unsigned long)pos); +#endif /* TRE_DEBUG */ + ctx->pos = pos; +} + +/* The compare() handler. Compares two substrings in the input and returns + 0 if the substrings are equal, and a nonzero value if not. */ +static int +str_handler_compare(size_t pos1, size_t pos2, size_t len, void *context) +{ + str_handler_ctx *ctx = context; +#ifdef TRE_DEBUG + fprintf(outf, "comparing %lu-%lu and %lu-%lu\n", + (unsigned long)pos1, (unsigned long)pos1 + len, + (unsigned long)pos2, (unsigned long)pos2 + len); +#endif /* TRE_DEBUG */ + return strncmp(ctx->str + pos1, ctx->str + pos2, len); +} + +/* Creates a tre_str_source wrapper around the string `str'. Returns the + tre_str_source object or NULL if out of memory. */ +static tre_str_source * +make_str_source(const char *str) +{ + tre_str_source *s; + str_handler_ctx *ctx; + + s = calloc(1, sizeof(*s)); + if (!s) + return NULL; + + ctx = malloc(sizeof(str_handler_ctx)); + if (!ctx) + { + free(s); + return NULL; + } + + ctx->str = str; + ctx->pos = 0; + s->context = ctx; + s->get_next_char = str_handler_get_next; + s->rewind = str_handler_rewind; + s->compare = str_handler_compare; + + return s; +} + +/* Frees the memory allocated for `s'. */ +static void +free_str_source(tre_str_source *s) +{ + free(s->context); + free(s); +} + +/* Run one test with tre_reguexec. Returns 1 if the regex matches, 0 if + it doesn't, and -1 if an error occurs. 
*/ +static int +test_reguexec(const char *str, const char *regex) +{ + regex_t preg; + tre_str_source *source; + regmatch_t pmatch[5]; + int ret; + + if ((source = make_str_source(str)) == NULL) + { + fprintf(stderr, "Out of memory\n"); + ret = -1; + } + else + { + if (tre_regcomp(&preg, regex, REG_EXTENDED) != REG_OK) + { + fprintf(stderr, "Failed to compile /%s/\n", regex); + ret = -1; + } + else + { + if (tre_reguexec(&preg, source, elementsof(pmatch), pmatch, 0) == 0) + { + fprintf(outf, "Match: /%s/ matches \"%.*s\" in \"%s\"\n", regex, + (int)(pmatch[0].rm_eo - pmatch[0].rm_so), + str + pmatch[0].rm_so, str); + ret = 1; + } + else + { + fprintf(outf, "No match: /%s/ in \"%s\"\n", regex, str); + ret = 0; + } + tre_regfree(&preg); + } + free_str_source(source); + } + return ret; +} + +int +main(int argc, char **argv) +{ + int ret = 0; + outf = stdout; +#if defined(HAVE_UNISTD_H) || defined(HAVE_GETOPT_H) + int opt; + while ((opt = getopt(argc, argv, "o:")) != EOF) + { + switch (opt) + { + case 'o': + if ((outf = fopen(optarg, "w")) == NULL) + { + perror(optarg); + exit(1); + } + break; + default: + /* getopt() will have printed an error message already */ + exit(1); + } + } +#endif + ret += test_reguexec("xfoofofoofoo", "(foo)\\1") != 1; + ret += test_reguexec("catcat", "(cat|dog)\\1") != 1; + ret += test_reguexec("catdog", "(cat|dog)\\1") != 0; + ret += test_reguexec("dogdog", "(cat|dog)\\1") != 1; + ret += test_reguexec("dogcat", "(cat|dog)\\1") != 0; + + return ret; +} diff --git a/redis.conf b/redis.conf index 79157b7d2..3688ae5e1 100644 --- a/redis.conf +++ b/redis.conf @@ -2044,6 +2044,7 @@ latency-monitor-threshold 0 # e Evicted events (events generated when a key is evicted for maxmemory) # n New key events (Note: not included in the 'A' class) # t Stream commands +# a Array commands # d Module key type events # m Key-miss events (Note: It is not included in the 'A' class) # o Overwritten events generated every time a key is overwritten. 
@@ -2057,7 +2058,7 @@ latency-monitor-threshold 0 # __subkeyspaceitem@__:\n prefix. # V Subkeyspaceevent events, published with # __subkeyspaceevent@__:| prefix. -# A Alias for g$lshzxetd, so that the "AKE" string means all the events +# A Alias for g$lshzxetad, so that the "AKE" string means all the events # except key-miss, new key, overwritten, type-changed and rate-limit. # # The "notify-keyspace-events" takes as argument a string that is composed @@ -2187,6 +2188,37 @@ stream-node-max-entries 100 # stream-idmp-duration 100 # stream-idmp-maxsize 100 +# Arrays use a sliced directory structure for O(1) access. The slice size +# controls the granularity of memory allocation - each slice covers a range +# of indices. Must be a power of two between 256 and 65536. +# +# Smaller slices (1024-2048): Better for sparse data with large gaps between +# indices, or many small arrays. Uses less memory per slice but more directory +# entries. +# +# Larger slices (8192-16384): Better for dense/contiguous data. Fewer directory +# entries but may waste memory if data is sparse within slices. +# +# Default 4096 works well for mixed workloads. If you change this setting via +# CONFIG SET, existing arrays retain their original slice size. +# +# IMPORTANT CONSIDERATION: Redis arrays, for slices with very few elements, are +# able to use a sparse representation, where the slice is not really +# materialized into an actual contiguous allocation. See the next configuration +# parameters for more information. +array-slice-size 4096 + +# Arrays start with sparse slices (sorted key-value pairs) for memory efficiency +# when elements are scattered. When a sparse slice exceeds array-sparse-kmax +# entries, it promotes to a dense slice (direct array). When a dense slice's +# element count drops below array-sparse-kmin and demotion would save memory, +# it demotes back to sparse. Set kmax to 0 to disable sparse encoding entirely. 
+# Set kmin to 0 if you never want dense slices to be demoted to sparse (useful +# when your workload repeatedly drains arrays to an almost empty state and then +# fills them again). +array-sparse-kmax 10 +array-sparse-kmin 5 + # Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in # order to help rehashing the main Redis hash table (the one mapping top-level # keys to values). The hash table implementation Redis uses (see dict.c) diff --git a/src/Makefile b/src/Makefile index cf0395d1c..fea95efd4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -37,7 +37,7 @@ endif ifneq ($(OPTIMIZATION),-O0) OPTIMIZATION+=-fno-omit-frame-pointer endif -DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram fpconv xxhash +DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram fpconv xxhash tre NODEPS:=clean distclean # Default settings @@ -384,7 +384,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o memory_prefetch.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o entry.o kvstore.o fwtree.o estore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_asm.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o lolwut8.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o
script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o keymeta.o chk.o hotkeys.o gcra.o vector.o fast_float_strtod.o +REDIS_SERVER_OBJ=threads_mngr.o memory_prefetch.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o entry.o kvstore.o fwtree.o estore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_array.o sparsearray.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_asm.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o lolwut8.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o keymeta.o chk.o hotkeys.o gcra.o vector.o fast_float_strtod.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) @@ -444,7 +444,7 @@ endif # redis-server $(REDIS_SERVER_NAME): $(REDIS_SERVER_OBJ) $(REDIS_VEC_SETS_OBJ) - $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a ../deps/xxhash/libxxhash.a $(FINAL_LIBS) + $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a 
../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a ../deps/xxhash/libxxhash.a ../deps/tre/libtre.a $(FINAL_LIBS) # redis-sentinel $(REDIS_SENTINEL_NAME): $(REDIS_SERVER_NAME) diff --git a/src/acl.c b/src/acl.c index 79a900200..95f749299 100644 --- a/src/acl.c +++ b/src/acl.c @@ -57,6 +57,7 @@ struct ACLCategoryItem { {"list", ACL_CATEGORY_LIST}, {"hash", ACL_CATEGORY_HASH}, {"string", ACL_CATEGORY_STRING}, + {"array", ACL_CATEGORY_ARRAY}, {"bitmap", ACL_CATEGORY_BITMAP}, {"hyperloglog", ACL_CATEGORY_HYPERLOGLOG}, {"geo", ACL_CATEGORY_GEO}, diff --git a/src/aof.c b/src/aof.c index a2bf945f2..8b6eb5709 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2515,6 +2515,116 @@ werr: return 0; } +/* Write unsigned 64-bit integer as bulk string. + * Unlike rioWriteBulkLongLong which uses signed representation, + * this correctly handles values >= 2^63 (e.g., array indices). */ +static int rioWriteBulkUnsignedLongLong(rio *r, uint64_t value) { + char buf[24]; + int len = ull2string(buf, sizeof(buf), value); + return rioWriteBulkString(r, buf, len); +} + +/* Helper to emit a single array element for AOF rewrite. + * Returns 0 on error, 1 on success. Updates count and items. */ +static int aofEmitArrayElement(rio *r, robj *key, uint64_t idx, void *v, + long long *count, long long *items) { + if (*count == 0) { + int cmd_items = (*items > AOF_REWRITE_ITEMS_PER_CMD/2) ? + AOF_REWRITE_ITEMS_PER_CMD/2 : *items; /* pairs of idx+val */ + if (!rioWriteBulkCount(r,'*',2+cmd_items*2) || + !rioWriteBulkString(r,"ARMSET",6) || + !rioWriteBulkObject(r,key)) + { + return 0; + } + } + + /* Write index (unsigned to handle indices >= 2^63) */ + if (!rioWriteBulkUnsignedLongLong(r, idx)) return 0; + + /* Write value - inline types use scratch space, arString aliases directly. 
*/ + char buf[AR_INLINE_BUFSIZE]; + size_t len; + const char *data = arDecode(v, buf, sizeof(buf), &len); + if (!rioWriteBulkString(r, data, len)) return 0; + + if (++(*count) == AOF_REWRITE_ITEMS_PER_CMD/2) *count = 0; + (*items)--; + return 1; +} + +/* Helper to emit all elements from a slice for AOF rewrite. */ +static int aofEmitSliceElements(rio *r, robj *key, arSlice *s, uint64_t slice_id, + uint32_t slice_size, long long *count, long long *items) { + if (s->encoding == AR_SLICE_DENSE) { + for (uint32_t i = 0; i < s->layout.dense.winsize; i++) { + void *v = s->layout.dense.items[i]; + if (arIsEmpty(v)) continue; + uint64_t idx = arMakeIdx(slice_id, s->layout.dense.offset + i, slice_size); + if (!aofEmitArrayElement(r, key, idx, v, count, items)) return 0; + } + } else { + /* Sparse slice */ + uint16_t *offsets = s->layout.sparse.offsets; + void **values = s->layout.sparse.values; + for (uint32_t i = 0; i < s->count; i++) { + uint64_t idx = arMakeIdx(slice_id, offsets[i], slice_size); + if (!aofEmitArrayElement(r, key, idx, values[i], count, items)) return 0; + } + } + return 1; +} + +/* Emit the commands needed to rebuild an array object. + * The function returns 0 on error, 1 on success. */ +int rewriteArrayObject(rio *r, robj *key, robj *o) { + redisArray *ar = o->ptr; + long long count = 0, items = ar->count; + if (items == 0) return 1; + + /* Iterate through all slices, handling both flat directory mode and + * superdir mode. This mirrors the iteration logic in rdb.c. 
*/ + if (ar->superdir) { + /* Superdir mode: iterate through blocks */ + for (uint32_t bi = 0; bi < ar->sdir_len; bi++) { + arSDirEntry *e = ar->superdir + bi; + uint64_t block_base = e->block_id * AR_SUPER_BLOCK_SLOTS; + + for (uint32_t si = 0; si < AR_SUPER_BLOCK_SLOTS; si++) { + arSlice *s = e->slots[si]; + if (!s) continue; + uint64_t slice_id = block_base + si; + if (!aofEmitSliceElements(r, key, s, slice_id, ar->slice_size, + &count, &items)) return 0; + } + } + } else { + /* Flat directory mode */ + for (uint64_t slice_id = 0; slice_id <= ar->dir_highest_used && slice_id < ar->dir_alloc; slice_id++) { + arSlice *s = ar->dir[slice_id]; + if (!s) continue; + if (!aofEmitSliceElements(r, key, s, slice_id, ar->slice_size, + &count, &items)) return 0; + } + } + + /* If insert_idx is set, emit ARSEEK command to restore it. + * When insert_idx == UINT64_MAX-1, we emit ARSEEK UINT64_MAX which + * correctly sets insert_idx back to UINT64_MAX-1 (terminal state). */ + if (ar->insert_idx != AR_INSERT_IDX_NONE) { + /* ARSEEK key insert_idx+1 (ARSEEK sets position for next insert) */ + if (!rioWriteBulkCount(r,'*',3) || + !rioWriteBulkString(r,"ARSEEK",6) || + !rioWriteBulkObject(r,key) || + !rioWriteBulkUnsignedLongLong(r, ar->insert_idx + 1)) + { + return 0; + } + } + + return 1; +} + int rewriteObject(rio *r, robj *key, robj *o, int dbid, long long expiretime) { /* Save the key and associated value */ if (o->type == OBJ_STRING) { @@ -2536,6 +2646,8 @@ int rewriteObject(rio *r, robj *key, robj *o, int dbid, long long expiretime) { if (rewriteStreamObject(r,key,o) == 0) return C_ERR; } else if (o->type == OBJ_GCRA) { if (rewriteGCRAObject(r,key,o) == 0) return C_ERR; + } else if (o->type == OBJ_ARRAY) { + if (rewriteArrayObject(r,key,o) == 0) return C_ERR; } else if (o->type == OBJ_MODULE) { if (rewriteModuleObject(r,key,o,dbid) == 0) return C_ERR; } else { diff --git a/src/commands.def b/src/commands.def index 7e4a14dc8..2726d1288 100644 --- a/src/commands.def +++ 
b/src/commands.def @@ -24,6 +24,7 @@ const char *COMMAND_GROUP_STR[] = { "geo", "stream", "bitmap", + "array", "module", "rate_limit" }; @@ -31,6 +32,535 @@ const char *COMMAND_GROUP_STR[] = { const char *commandGroupStr(int index) { return COMMAND_GROUP_STR[index]; } +/********** ARCOUNT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARCOUNT history */ +#define ARCOUNT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARCOUNT tips */ +#define ARCOUNT_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARCOUNT key specs */ +keySpec ARCOUNT_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARCOUNT argument table */ +struct COMMAND_ARG ARCOUNT_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/********** ARDEL ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARDEL history */ +#define ARDEL_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARDEL tips */ +#define ARDEL_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARDEL key specs */ +keySpec ARDEL_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_DELETE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARDEL argument table */ +struct COMMAND_ARG ARDEL_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("index",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** ARDELRANGE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARDELRANGE history */ +#define ARDELRANGE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARDELRANGE tips */ +#define ARDELRANGE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARDELRANGE key specs */ +keySpec ARDELRANGE_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_DELETE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARDELRANGE range argument table */ +struct COMMAND_ARG 
ARDELRANGE_range_Subargs[] = { +{MAKE_ARG("start",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("end",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* ARDELRANGE argument table */ +struct COMMAND_ARG ARDELRANGE_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("range",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,2,NULL),.subargs=ARDELRANGE_range_Subargs}, +}; + +/********** ARGET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARGET history */ +#define ARGET_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARGET tips */ +#define ARGET_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARGET key specs */ +keySpec ARGET_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARGET argument table */ +struct COMMAND_ARG ARGET_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("index",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/********** ARGETRANGE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARGETRANGE history */ +#define ARGETRANGE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARGETRANGE tips */ +#define ARGETRANGE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARGETRANGE key specs */ +keySpec ARGETRANGE_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARGETRANGE argument table */ +struct COMMAND_ARG ARGETRANGE_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("start",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("end",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/********** ARGREP ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARGREP history */ +#define ARGREP_History NULL +#endif + +#ifndef 
SKIP_CMD_TIPS_TABLE +/* ARGREP tips */ +#define ARGREP_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARGREP key specs */ +keySpec ARGREP_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARGREP predicate exact argument table */ +struct COMMAND_ARG ARGREP_predicate_exact_Subargs[] = { +{MAKE_ARG("exact",ARG_TYPE_PURE_TOKEN,-1,"EXACT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("string",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* ARGREP predicate match argument table */ +struct COMMAND_ARG ARGREP_predicate_match_Subargs[] = { +{MAKE_ARG("match",ARG_TYPE_PURE_TOKEN,-1,"MATCH",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("string",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* ARGREP predicate glob argument table */ +struct COMMAND_ARG ARGREP_predicate_glob_Subargs[] = { +{MAKE_ARG("glob",ARG_TYPE_PURE_TOKEN,-1,"GLOB",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("pattern",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* ARGREP predicate re argument table */ +struct COMMAND_ARG ARGREP_predicate_re_Subargs[] = { +{MAKE_ARG("re",ARG_TYPE_PURE_TOKEN,-1,"RE",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("pattern",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* ARGREP predicate argument table */ +struct COMMAND_ARG ARGREP_predicate_Subargs[] = { +{MAKE_ARG("exact",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=ARGREP_predicate_exact_Subargs}, +{MAKE_ARG("match",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=ARGREP_predicate_match_Subargs}, +{MAKE_ARG("glob",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=ARGREP_predicate_glob_Subargs}, +{MAKE_ARG("re",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=ARGREP_predicate_re_Subargs}, +}; + +/* ARGREP options argument table */ +struct COMMAND_ARG ARGREP_options_Subargs[] = { 
+{MAKE_ARG("and",ARG_TYPE_PURE_TOKEN,-1,"AND",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("or",ARG_TYPE_PURE_TOKEN,-1,"OR",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("limit",ARG_TYPE_INTEGER,-1,"LIMIT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("withvalues",ARG_TYPE_PURE_TOKEN,-1,"WITHVALUES",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("nocase",ARG_TYPE_PURE_TOKEN,-1,"NOCASE",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* ARGREP argument table */ +struct COMMAND_ARG ARGREP_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("start",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("end",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("predicate",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,4,NULL),.subargs=ARGREP_predicate_Subargs}, +{MAKE_ARG("options",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL|CMD_ARG_MULTIPLE,5,NULL),.subargs=ARGREP_options_Subargs}, +}; + +/********** ARINFO ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARINFO history */ +#define ARINFO_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARINFO tips */ +#define ARINFO_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARINFO key specs */ +keySpec ARINFO_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARINFO argument table */ +struct COMMAND_ARG ARINFO_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("full",ARG_TYPE_PURE_TOKEN,-1,"FULL",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, +}; + +/********** ARINSERT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARINSERT history */ +#define ARINSERT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARINSERT tips */ +#define ARINSERT_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARINSERT key specs */ +keySpec ARINSERT_Keyspecs[1] = { 
+{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARINSERT argument table */ +struct COMMAND_ARG ARINSERT_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** ARLASTITEMS ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARLASTITEMS history */ +#define ARLASTITEMS_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARLASTITEMS tips */ +#define ARLASTITEMS_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARLASTITEMS key specs */ +keySpec ARLASTITEMS_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARLASTITEMS argument table */ +struct COMMAND_ARG ARLASTITEMS_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("count",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("rev",ARG_TYPE_PURE_TOKEN,-1,"REV",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, +}; + +/********** ARLEN ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARLEN history */ +#define ARLEN_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARLEN tips */ +#define ARLEN_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARLEN key specs */ +keySpec ARLEN_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARLEN argument table */ +struct COMMAND_ARG ARLEN_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/********** ARMGET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARMGET history */ +#define ARMGET_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARMGET tips */ +#define ARMGET_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARMGET key specs */ +keySpec ARMGET_Keyspecs[1] = { 
+{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARMGET argument table */ +struct COMMAND_ARG ARMGET_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("index",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** ARMSET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARMSET history */ +#define ARMSET_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARMSET tips */ +#define ARMSET_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARMSET key specs */ +keySpec ARMSET_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARMSET data argument table */ +struct COMMAND_ARG ARMSET_data_Subargs[] = { +{MAKE_ARG("index",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* ARMSET argument table */ +struct COMMAND_ARG ARMSET_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("data",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,2,NULL),.subargs=ARMSET_data_Subargs}, +}; + +/********** ARNEXT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARNEXT history */ +#define ARNEXT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARNEXT tips */ +#define ARNEXT_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARNEXT key specs */ +keySpec ARNEXT_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARNEXT argument table */ +struct COMMAND_ARG ARNEXT_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/********** AROP ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* AROP history */ +#define AROP_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* AROP tips */ +#define 
AROP_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* AROP key specs */ +keySpec AROP_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* AROP operation match argument table */ +struct COMMAND_ARG AROP_operation_match_Subargs[] = { +{MAKE_ARG("match",ARG_TYPE_PURE_TOKEN,-1,"MATCH",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* AROP operation argument table */ +struct COMMAND_ARG AROP_operation_Subargs[] = { +{MAKE_ARG("sum",ARG_TYPE_PURE_TOKEN,-1,"SUM",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("min",ARG_TYPE_PURE_TOKEN,-1,"MIN",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("max",ARG_TYPE_PURE_TOKEN,-1,"MAX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("and",ARG_TYPE_PURE_TOKEN,-1,"AND",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("or",ARG_TYPE_PURE_TOKEN,-1,"OR",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xor",ARG_TYPE_PURE_TOKEN,-1,"XOR",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("match",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=AROP_operation_match_Subargs}, +{MAKE_ARG("used",ARG_TYPE_PURE_TOKEN,-1,"USED",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* AROP argument table */ +struct COMMAND_ARG AROP_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("start",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("end",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("operation",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,8,NULL),.subargs=AROP_operation_Subargs}, +}; + +/********** ARRING ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARRING history */ +#define ARRING_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARRING tips */ +#define ARRING_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARRING key specs */ +keySpec ARRING_Keyspecs[1] = { 
+{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARRING argument table */ +struct COMMAND_ARG ARRING_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("size",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** ARSCAN ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARSCAN history */ +#define ARSCAN_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARSCAN tips */ +#define ARSCAN_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARSCAN key specs */ +keySpec ARSCAN_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARSCAN argument table */ +struct COMMAND_ARG ARSCAN_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("start",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("end",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("limit",ARG_TYPE_INTEGER,-1,"LIMIT",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, +}; + +/********** ARSEEK ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARSEEK history */ +#define ARSEEK_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* ARSEEK tips */ +#define ARSEEK_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARSEEK key specs */ +keySpec ARSEEK_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARSEEK argument table */ +struct COMMAND_ARG ARSEEK_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("index",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/********** ARSET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* ARSET history */ +#define ARSET_History NULL +#endif + +#ifndef 
SKIP_CMD_TIPS_TABLE +/* ARSET tips */ +#define ARSET_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* ARSET key specs */ +keySpec ARSET_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* ARSET argument table */ +struct COMMAND_ARG ARSET_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("index",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + /********** BITCOUNT ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -11876,6 +12406,25 @@ struct COMMAND_ARG WATCH_Args[] = { /* Main command table */ struct COMMAND_STRUCT redisCommandTable[] = { +/* array */ +{MAKE_CMD("arcount","Returns the number of non-empty elements in an array.","O(1)","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARCOUNT_History,0,ARCOUNT_Tips,0,arcountCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_ARRAY,ARCOUNT_Keyspecs,1,NULL,1),.args=ARCOUNT_Args}, +{MAKE_CMD("ardel","Deletes elements at the specified indices in an array.","O(N) where N is the number of indices to delete","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARDEL_History,0,ARDEL_Tips,0,ardelCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_ARRAY,ARDEL_Keyspecs,1,NULL,2),.args=ARDEL_Args}, +{MAKE_CMD("ardelrange","Deletes elements in one or more ranges.","Proportional to the number of existing elements / slices touched, not to the numeric span of the requested ranges","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARDELRANGE_History,0,ARDELRANGE_Tips,0,ardelrangeCommand,-4,CMD_WRITE,ACL_CATEGORY_ARRAY,ARDELRANGE_Keyspecs,1,NULL,2),.args=ARDELRANGE_Args}, +{MAKE_CMD("arget","Gets the value at an index in an 
array.","O(1)","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARGET_History,0,ARGET_Tips,0,argetCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_ARRAY,ARGET_Keyspecs,1,NULL,2),.args=ARGET_Args}, +{MAKE_CMD("argetrange","Gets values in a range of indices.","O(N) where N is the range length","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARGETRANGE_History,0,ARGETRANGE_Tips,0,argetrangeCommand,4,CMD_READONLY,ACL_CATEGORY_ARRAY,ARGETRANGE_Keyspecs,1,NULL,3),.args=ARGETRANGE_Args}, +{MAKE_CMD("argrep","Searches array elements in a range using textual predicates.","O(P * C) where P is the number of visited positions in touched slices and C is the cost of evaluating the predicates on one existing element.","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARGREP_History,0,ARGREP_Tips,0,argrepCommand,-6,CMD_READONLY,ACL_CATEGORY_ARRAY,ARGREP_Keyspecs,1,NULL,5),.args=ARGREP_Args}, +{MAKE_CMD("arinfo","Returns metadata about an array.","O(1), or O(N) with FULL option where N is the number of slices.","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARINFO_History,0,ARINFO_Tips,0,arinfoCommand,-2,CMD_READONLY,ACL_CATEGORY_ARRAY,ARINFO_Keyspecs,1,NULL,2),.args=ARINFO_Args}, +{MAKE_CMD("arinsert","Inserts one or more values at consecutive indices.","O(N) where N is the number of values","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARINSERT_History,0,ARINSERT_Tips,0,arinsertCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_ARRAY,ARINSERT_Keyspecs,1,NULL,2),.args=ARINSERT_Args}, +{MAKE_CMD("arlastitems","Returns the most recently inserted elements.","O(N) where N is the count","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARLASTITEMS_History,0,ARLASTITEMS_Tips,0,arlastitemsCommand,-3,CMD_READONLY,ACL_CATEGORY_ARRAY,ARLASTITEMS_Keyspecs,1,NULL,3),.args=ARLASTITEMS_Args}, +{MAKE_CMD("arlen","Returns the length of an array (max index + 
1).","O(1)","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARLEN_History,0,ARLEN_Tips,0,arlenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_ARRAY,ARLEN_Keyspecs,1,NULL,1),.args=ARLEN_Args}, +{MAKE_CMD("armget","Gets values at multiple indices in an array.","O(N) where N is the number of indices","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARMGET_History,0,ARMGET_Tips,0,armgetCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_ARRAY,ARMGET_Keyspecs,1,NULL,2),.args=ARMGET_Args}, +{MAKE_CMD("armset","Sets multiple index-value pairs in an array.","O(N) where N is the number of pairs","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARMSET_History,0,ARMSET_Tips,0,armsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_ARRAY,ARMSET_Keyspecs,1,NULL,2),.args=ARMSET_Args}, +{MAKE_CMD("arnext","Returns the next index ARINSERT would use.","O(1)","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARNEXT_History,0,ARNEXT_Tips,0,arnextCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_ARRAY,ARNEXT_Keyspecs,1,NULL,1),.args=ARNEXT_Args}, +{MAKE_CMD("arop","Performs aggregate operations on array elements in a range.","O(P) where P is visited positions in touched slices (dense scanned slots + sparse entries), with worst-case O(|end-start|+1) and typical case close to O(N), where N is the number of existing elements in range.","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,AROP_History,0,AROP_Tips,0,aropCommand,-5,CMD_READONLY,ACL_CATEGORY_ARRAY,AROP_Keyspecs,1,NULL,4),.args=AROP_Args}, +{MAKE_CMD("arring","Inserts values into a ring buffer of specified size, wrapping and truncating as needed.","O(M) normally, O(N+M) on ring resize, where N is the maximum of the old and new ring size and M is the number of inserted values","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARRING_History,0,ARRING_Tips,0,arringCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_ARRAY,ARRING_Keyspecs,1,NULL,3),.args=ARRING_Args}, 
+{MAKE_CMD("arscan","Iterates existing elements in a range, returning index-value pairs.","O(P) where P is visited positions in touched slices (dense scanned slots + sparse entries), with worst-case O(|end-start|+1) and typical case close to O(N), where N is the number of existing elements in range.","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARSCAN_History,0,ARSCAN_Tips,0,arscanCommand,-4,CMD_READONLY,ACL_CATEGORY_ARRAY,ARSCAN_Keyspecs,1,NULL,4),.args=ARSCAN_Args}, +{MAKE_CMD("arseek","Sets the ARINSERT / ARRING cursor to a specific index.","O(1)","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARSEEK_History,0,ARSEEK_Tips,0,arseekCommand,3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_ARRAY,ARSEEK_Keyspecs,1,NULL,2),.args=ARSEEK_Args}, +{MAKE_CMD("arset","Sets one or more contiguous values starting at an index in an array.","O(N) where N is the number of values","8.8.0",CMD_DOC_NONE,NULL,NULL,"array",COMMAND_GROUP_ARRAY,ARSET_History,0,ARSET_Tips,0,arsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_ARRAY,ARSET_Keyspecs,1,NULL,3),.args=ARSET_Args}, /* bitmap */ {MAKE_CMD("bitcount","Counts the number of set bits (population counting) in a string.","O(N)","2.6.0",CMD_DOC_NONE,NULL,NULL,"bitmap",COMMAND_GROUP_BITMAP,BITCOUNT_History,1,BITCOUNT_Tips,0,bitcountCommand,-2,CMD_READONLY,ACL_CATEGORY_BITMAP,BITCOUNT_Keyspecs,1,NULL,2),.args=BITCOUNT_Args}, {MAKE_CMD("bitfield","Performs arbitrary bitfield integer operations on strings.","O(1) for each subcommand specified","3.2.0",CMD_DOC_NONE,NULL,NULL,"bitmap",COMMAND_GROUP_BITMAP,BITFIELD_History,0,BITFIELD_Tips,0,bitfieldCommand,-2,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_BITMAP,BITFIELD_Keyspecs,1,bitfieldGetKeys,2),.args=BITFIELD_Args}, diff --git a/src/commands/arcount.json b/src/commands/arcount.json new file mode 100644 index 000000000..3452a6ec7 --- /dev/null +++ b/src/commands/arcount.json @@ -0,0 +1,48 @@ +{ + "ARCOUNT": { + "summary": "Returns the number of non-empty elements in an array.", + 
"complexity": "O(1)", + "group": "array", + "since": "8.8.0", + "arity": 2, + "function": "arcountCommand", + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "The number of non-empty elements, or 0 if key does not exist.", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + } + ] + } +} diff --git a/src/commands/ardel.json b/src/commands/ardel.json new file mode 100644 index 000000000..e29d56181 --- /dev/null +++ b/src/commands/ardel.json @@ -0,0 +1,53 @@ +{ + "ARDEL": { + "summary": "Deletes elements at the specified indices in an array.", + "complexity": "O(N) where N is the number of indices to delete", + "group": "array", + "since": "8.8.0", + "arity": -3, + "function": "ardelCommand", + "command_flags": [ + "WRITE", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RW", + "DELETE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Number of elements deleted.", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "index", + "type": "integer", + "multiple": true + } + ] + } +} diff --git a/src/commands/ardelrange.json b/src/commands/ardelrange.json new file mode 100644 index 000000000..0ed67ced9 --- /dev/null +++ b/src/commands/ardelrange.json @@ -0,0 +1,62 @@ +{ + "ARDELRANGE": { + "summary": "Deletes elements in one or more ranges.", + "complexity": "Proportional to the number of existing elements / slices touched, not to the numeric span of the requested ranges", + "group": "array", + "since": "8.8.0", + 
"arity": -4, + "function": "ardelrangeCommand", + "command_flags": [ + "WRITE" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RW", + "DELETE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Number of elements deleted.", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "range", + "type": "block", + "multiple": true, + "arguments": [ + { + "name": "start", + "type": "integer" + }, + { + "name": "end", + "type": "integer" + } + ] + } + ] + } +} diff --git a/src/commands/arget.json b/src/commands/arget.json new file mode 100644 index 000000000..481bb4f66 --- /dev/null +++ b/src/commands/arget.json @@ -0,0 +1,60 @@ +{ + "ARGET": { + "summary": "Gets the value at an index in an array.", + "complexity": "O(1)", + "group": "array", + "since": "8.8.0", + "arity": 3, + "function": "argetCommand", + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "The value at the given index.", + "type": "string" + }, + { + "description": "Null reply if key or index does not exist.", + "type": "null" + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "index", + "type": "integer" + } + ] + } +} diff --git a/src/commands/argetrange.json b/src/commands/argetrange.json new file mode 100644 index 000000000..02d1fa6f0 --- /dev/null +++ b/src/commands/argetrange.json @@ -0,0 +1,64 @@ +{ + "ARGETRANGE": { + "summary": "Gets values in a range of indices.", + "complexity": "O(N) where N is the range length", + "group": 
"array", + "since": "8.8.0", + "arity": 4, + "function": "argetrangeCommand", + "command_flags": [ + "READONLY" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "start", + "type": "integer" + }, + { + "name": "end", + "type": "integer" + } + ] + } +} diff --git a/src/commands/argrep.json b/src/commands/argrep.json new file mode 100644 index 000000000..4ca4fa6f8 --- /dev/null +++ b/src/commands/argrep.json @@ -0,0 +1,182 @@ +{ + "ARGREP": { + "summary": "Searches array elements in a range using textual predicates.", + "complexity": "O(P * C) where P is the number of visited positions in touched slices and C is the cost of evaluating the predicates on one existing element.", + "group": "array", + "since": "8.8.0", + "arity": -6, + "function": "argrepCommand", + "command_flags": [ + "READONLY" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "anyOf": [ + { + "description": "Array of matching indexes.", + "type": "array", + "items": { + "type": "integer", + "description": "Index of a matching element" + } + }, + { + "description": "Array of [index, value] pairs. 
Returned in case `WITHVALUES` was used.", + "type": "array", + "items": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "description": "Index of a matching element" + }, + { + "type": "string", + "description": "Value at that index" + } + ] + } + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "start", + "type": "string" + }, + { + "name": "end", + "type": "string" + }, + { + "name": "predicate", + "type": "oneof", + "multiple": true, + "arguments": [ + { + "name": "exact", + "type": "block", + "arguments": [ + { + "name": "exact", + "type": "pure-token", + "token": "EXACT" + }, + { + "name": "string", + "type": "string" + } + ] + }, + { + "name": "match", + "type": "block", + "arguments": [ + { + "name": "match", + "type": "pure-token", + "token": "MATCH" + }, + { + "name": "string", + "type": "string" + } + ] + }, + { + "name": "glob", + "type": "block", + "arguments": [ + { + "name": "glob", + "type": "pure-token", + "token": "GLOB" + }, + { + "name": "pattern", + "type": "string" + } + ] + }, + { + "name": "re", + "type": "block", + "arguments": [ + { + "name": "re", + "type": "pure-token", + "token": "RE" + }, + { + "name": "pattern", + "type": "string" + } + ] + } + ] + }, + { + "name": "options", + "type": "oneof", + "optional": true, + "multiple": true, + "arguments": [ + { + "name": "and", + "type": "pure-token", + "token": "AND" + }, + { + "name": "or", + "type": "pure-token", + "token": "OR" + }, + { + "name": "limit", + "type": "integer", + "token": "LIMIT" + }, + { + "name": "withvalues", + "type": "pure-token", + "token": "WITHVALUES" + }, + { + "name": "nocase", + "type": "pure-token", + "token": "NOCASE" + } + ] + } + ] + } +} diff --git a/src/commands/arinfo.json b/src/commands/arinfo.json new file mode 100644 index 000000000..09b06ef10 --- /dev/null +++ b/src/commands/arinfo.json @@ -0,0 +1,103 @@ +{ + "ARINFO": { + "summary": "Returns 
metadata about an array.", + "complexity": "O(1), or O(N) with FULL option where N is the number of slices.", + "group": "array", + "since": "8.8.0", + "arity": -2, + "function": "arinfoCommand", + "command_flags": [ + "READONLY" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "type": "object", + "additionalProperties": false, + "properties": { + "count": { + "type": "integer", + "description": "Total number of non-empty elements." + }, + "len": { + "type": "integer", + "description": "Logical length (highest index + 1)." + }, + "next-insert-index": { + "type": "integer", + "description": "Index the next ARINSERT would use, or 0 if unset/exhausted." + }, + "slices": { + "type": "integer", + "description": "Number of allocated slices." + }, + "directory-size": { + "type": "integer", + "description": "Directory allocation capacity (flat dir_alloc or superdir sdir_cap)." + }, + "super-dir-entries": { + "type": "integer", + "description": "Number of super-directory entries (0 if not in superdir mode)." + }, + "slice-size": { + "type": "integer", + "description": "Configured slice size." + }, + "dense-slices": { + "type": "integer", + "description": "Number of dense slices (FULL only)." + }, + "sparse-slices": { + "type": "integer", + "description": "Number of sparse slices (FULL only)." + }, + "avg-dense-size": { + "type": "number", + "description": "Average allocation size of dense slices (FULL only)." + }, + "avg-dense-fill": { + "type": "number", + "description": "Average fill rate of dense slices (FULL only)." + }, + "avg-sparse-size": { + "type": "number", + "description": "Average capacity of sparse slices (FULL only)." 
+ } + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "full", + "type": "pure-token", + "token": "FULL", + "optional": true + } + ] + } +} diff --git a/src/commands/arinsert.json b/src/commands/arinsert.json new file mode 100644 index 000000000..6b8c6ed76 --- /dev/null +++ b/src/commands/arinsert.json @@ -0,0 +1,54 @@ +{ + "ARINSERT": { + "summary": "Inserts one or more values at consecutive indices.", + "complexity": "O(N) where N is the number of values", + "group": "array", + "since": "8.8.0", + "arity": -3, + "function": "arinsertCommand", + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "The last index where a value was inserted.", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "value", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/commands/arlastitems.json b/src/commands/arlastitems.json new file mode 100644 index 000000000..ed888bf83 --- /dev/null +++ b/src/commands/arlastitems.json @@ -0,0 +1,66 @@ +{ + "ARLASTITEMS": { + "summary": "Returns the most recently inserted elements.", + "complexity": "O(N) where N is the count", + "group": "array", + "since": "8.8.0", + "arity": -3, + "function": "arlastitemsCommand", + "command_flags": [ + "READONLY" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + } + }, + "arguments": 
[ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "count", + "type": "integer" + }, + { + "name": "rev", + "type": "pure-token", + "token": "REV", + "optional": true + } + ] + } +} diff --git a/src/commands/arlen.json b/src/commands/arlen.json new file mode 100644 index 000000000..36143dfc7 --- /dev/null +++ b/src/commands/arlen.json @@ -0,0 +1,48 @@ +{ + "ARLEN": { + "summary": "Returns the length of an array (max index + 1).", + "complexity": "O(1)", + "group": "array", + "since": "8.8.0", + "arity": 2, + "function": "arlenCommand", + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "The length of the array (max index + 1), or 0 if key does not exist.", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + } + ] + } +} diff --git a/src/commands/armget.json b/src/commands/armget.json new file mode 100644 index 000000000..f05023e03 --- /dev/null +++ b/src/commands/armget.json @@ -0,0 +1,62 @@ +{ + "ARMGET": { + "summary": "Gets values at multiple indices in an array.", + "complexity": "O(N) where N is the number of indices", + "group": "array", + "since": "8.8.0", + "arity": -3, + "function": "armgetCommand", + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": 
"index", + "type": "integer", + "multiple": true + } + ] + } +} diff --git a/src/commands/armset.json b/src/commands/armset.json new file mode 100644 index 000000000..002f01bc2 --- /dev/null +++ b/src/commands/armset.json @@ -0,0 +1,64 @@ +{ + "ARMSET": { + "summary": "Sets multiple index-value pairs in an array.", + "complexity": "O(N) where N is the number of pairs", + "group": "array", + "since": "8.8.0", + "arity": -4, + "function": "armsetCommand", + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Number of new slots that were set (previously empty).", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "data", + "type": "block", + "multiple": true, + "arguments": [ + { + "name": "index", + "type": "integer" + }, + { + "name": "value", + "type": "string" + } + ] + } + ] + } +} diff --git a/src/commands/arnext.json b/src/commands/arnext.json new file mode 100644 index 000000000..f64b178d0 --- /dev/null +++ b/src/commands/arnext.json @@ -0,0 +1,56 @@ +{ + "ARNEXT": { + "summary": "Returns the next index ARINSERT would use.", + "complexity": "O(1)", + "group": "array", + "since": "8.8.0", + "arity": 2, + "function": "arnextCommand", + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "The next index ARINSERT would use. 
Returns 0 for missing keys or when no insert happened yet.", + "type": "integer" + }, + { + "description": "Null when the insertion cursor is exhausted (next insert would overflow).", + "type": "null" + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + } + ] + } +} diff --git a/src/commands/arop.json b/src/commands/arop.json new file mode 100644 index 000000000..eb18566bb --- /dev/null +++ b/src/commands/arop.json @@ -0,0 +1,123 @@ +{ + "AROP": { + "summary": "Performs aggregate operations on array elements in a range.", + "complexity": "O(P) where P is visited positions in touched slices (dense scanned slots + sparse entries), with worst-case O(|end-start|+1) and typical case close to O(N), where N is the number of existing elements in range.", + "group": "array", + "since": "8.8.0", + "arity": -5, + "function": "aropCommand", + "command_flags": [ + "READONLY" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Result of the operation.", + "type": "string" + }, + { + "description": "Integer result for MATCH, USED, AND, OR, XOR.", + "type": "integer" + }, + { + "description": "Null if no elements match the operation.", + "type": "null" + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "start", + "type": "integer" + }, + { + "name": "end", + "type": "integer" + }, + { + "name": "operation", + "type": "oneof", + "arguments": [ + { + "name": "sum", + "type": "pure-token", + "token": "SUM" + }, + { + "name": "min", + "type": "pure-token", + "token": "MIN" + }, + { + "name": "max", + "type": "pure-token", + "token": "MAX" + }, + { + "name": "and", + "type": "pure-token", + "token": "AND" + }, + { + "name": "or", + "type": 
"pure-token", + "token": "OR" + }, + { + "name": "xor", + "type": "pure-token", + "token": "XOR" + }, + { + "name": "match", + "type": "block", + "arguments": [ + { + "name": "match", + "type": "pure-token", + "token": "MATCH" + }, + { + "name": "value", + "type": "string" + } + ] + }, + { + "name": "used", + "type": "pure-token", + "token": "USED" + } + ] + } + ] + } +} diff --git a/src/commands/arring.json b/src/commands/arring.json new file mode 100644 index 000000000..01bddf7d7 --- /dev/null +++ b/src/commands/arring.json @@ -0,0 +1,57 @@ +{ + "ARRING": { + "summary": "Inserts values into a ring buffer of specified size, wrapping and truncating as needed.", + "complexity": "O(M) normally, O(N+M) on ring resize, where N is the maximum of the old and new ring size and M is the number of inserted values", + "group": "array", + "since": "8.8.0", + "arity": -4, + "function": "arringCommand", + "command_flags": [ + "WRITE", + "DENYOOM" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "The last index where a value was inserted.", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "size", + "type": "integer" + }, + { + "name": "value", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/commands/arscan.json b/src/commands/arscan.json new file mode 100644 index 000000000..3c75f3207 --- /dev/null +++ b/src/commands/arscan.json @@ -0,0 +1,76 @@ +{ + "ARSCAN": { + "summary": "Iterates existing elements in a range, returning index-value pairs.", + "complexity": "O(P) where P is visited positions in touched slices (dense scanned slots + sparse entries), with worst-case O(|end-start|+1) and typical case close to O(N), where N is the number of existing elements in 
range.", + "group": "array", + "since": "8.8.0", + "arity": -4, + "function": "arscanCommand", + "command_flags": [ + "READONLY" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of [index, value] pairs.", + "type": "array", + "items": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "description": "Index of existing element" + }, + { + "type": "string", + "description": "Value at that index" + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "start", + "type": "integer" + }, + { + "name": "end", + "type": "integer" + }, + { + "name": "limit", + "token": "LIMIT", + "type": "integer", + "optional": true + } + ] + } +} diff --git a/src/commands/arseek.json b/src/commands/arseek.json new file mode 100644 index 000000000..58904c77d --- /dev/null +++ b/src/commands/arseek.json @@ -0,0 +1,52 @@ +{ + "ARSEEK": { + "summary": "Sets the ARINSERT / ARRING cursor to a specific index.", + "complexity": "O(1)", + "group": "array", + "since": "8.8.0", + "arity": 3, + "function": "arseekCommand", + "command_flags": [ + "WRITE", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "1 if the cursor was set, 0 if the key does not exist.", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "index", + "type": "integer" + } + ] + } +} diff --git a/src/commands/arset.json b/src/commands/arset.json new file mode 100644 index 000000000..6d5e8453f --- 
/dev/null +++ b/src/commands/arset.json @@ -0,0 +1,58 @@ +{ + "ARSET": { + "summary": "Sets one or more contiguous values starting at an index in an array.", + "complexity": "O(N) where N is the number of values", + "group": "array", + "since": "8.8.0", + "arity": -4, + "function": "arsetCommand", + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "ARRAY" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Number of new slots that were set (previously empty).", + "type": "integer" + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "index", + "type": "integer" + }, + { + "name": "value", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/commands/command-docs.json b/src/commands/command-docs.json index 5e76c806c..7648d7d51 100644 --- a/src/commands/command-docs.json +++ b/src/commands/command-docs.json @@ -59,6 +59,9 @@ { "const": "hyperloglog" }, + { + "const": "array" + }, { "const": "list" }, diff --git a/src/config.c b/src/config.c index 1320c8981..394d0fa01 100644 --- a/src/config.c +++ b/src/config.c @@ -2461,6 +2461,33 @@ static int isValidProcTitleTemplate(char *val, const char **err) { return 1; } +/* Validate that array-slice-size is a power of two */ +static int isValidArraySliceSize(long long val, const char **err) { + if (val <= 0 || (val & (val - 1)) != 0) { + *err = "array-slice-size must be a power of two"; + return 0; + } + return 1; +} + +/* Validate array-sparse-kmax: if non-zero, must be > kmin */ +static int isValidArraySparseKmax(long long val, const char **err) { + if (val > 0 && (unsigned int)val <= server.array_sparse_kmin) { + *err = "array-sparse-kmax must be greater than array-sparse-kmin when non-zero"; + return 0; + } + return 1; +} + +/* Validate 
array-sparse-kmin: must be < kmax when kmax is non-zero */ +static int isValidArraySparseKmin(long long val, const char **err) { + if (server.array_sparse_kmax > 0 && (unsigned int)val >= server.array_sparse_kmax) { + *err = "array-sparse-kmin must be less than array-sparse-kmax"; + return 0; + } + return 1; +} + static int updateLocaleCollate(const char **err) { const char *s = setlocale(LC_COLLATE, server.locale_collate); if (s == NULL) { @@ -3252,6 +3279,10 @@ standardConfig static_configs[] = { createUIntConfig("socket-mark-id", NULL, IMMUTABLE_CONFIG, 0, UINT_MAX, server.socket_mark_id, 0, INTEGER_CONFIG, NULL, NULL), createUIntConfig("max-new-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_conns_per_cycle, 10, INTEGER_CONFIG, NULL, NULL), createUIntConfig("max-new-tls-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_tls_conns_per_cycle, 1, INTEGER_CONFIG, NULL, NULL), + /* Array type configuration */ + createUIntConfig("array-slice-size", NULL, MODIFIABLE_CONFIG, AR_SLICE_SIZE_MIN, AR_SLICE_SIZE_MAX, server.array_slice_size, AR_SLICE_SIZE_DEFAULT, INTEGER_CONFIG, isValidArraySliceSize, NULL), + createUIntConfig("array-sparse-kmax", NULL, MODIFIABLE_CONFIG, 0, 256, server.array_sparse_kmax, AR_SPARSE_KMAX_DEFAULT, INTEGER_CONFIG, isValidArraySparseKmax, NULL), + createUIntConfig("array-sparse-kmin", NULL, MODIFIABLE_CONFIG, 0, 256, server.array_sparse_kmin, AR_SPARSE_KMIN_DEFAULT, INTEGER_CONFIG, isValidArraySparseKmin, NULL), #ifdef LOG_REQ_RES createUIntConfig("client-default-resp", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, 2, 3, server.client_default_resp, 2, INTEGER_CONFIG, NULL, NULL), #endif diff --git a/src/db.c b/src/db.c index 98197fbb4..e2dd50b50 100644 --- a/src/db.c +++ b/src/db.c @@ -1751,14 +1751,15 @@ int parseScanCursorOrReply(client *c, robj *o, unsigned long long *cursor) { } char *obj_type_name[OBJ_TYPE_MAX] = { - "string", - "list", - "set", - "zset", - "hash", + "string", + "list", + "set", + 
"zset", + "hash", NULL, /* module type is special */ "stream", - "gcra" + "gcra", + "array" }; /* Helper function to get type from a string in scan commands */ @@ -2438,6 +2439,7 @@ void copyCommand(client *c) { newobj = moduleTypeDupOrReply(c, key, newkey, dst->id, o); if (!newobj) return; break; + case OBJ_ARRAY: newobj = arrayTypeDup(o); break; default: addReplyError(c, "unknown type object"); return; diff --git a/src/debug.c b/src/debug.c index c6baf4b4d..9853d6c65 100644 --- a/src/debug.c +++ b/src/debug.c @@ -274,6 +274,21 @@ void xorObjectDigest(redisDb *db, robj *keyobj, unsigned char *digest, robj *o) mt->digest(&md,mv->value); xorDigest(digest,md.x,sizeof(md.x)); } + } else if (o->type == OBJ_ARRAY) { + redisArray *ar = o->ptr; + uint64_t len = arLen(ar); + for (uint64_t idx = 0; idx < len; idx++) { + void *v = arGet(ar, idx); + if (arIsEmpty(v)) { + /* For empty slots, contribute "(null)" */ + mixDigest(digest, "(null)", 6); + } else { + char vbuf[AR_INLINE_BUFSIZE]; + size_t vlen; + const char *data = arDecode(v, vbuf, sizeof(vbuf), &vlen); + mixDigest(digest, data, vlen); + } + } } else { serverPanic("Unknown object type"); } diff --git a/src/defrag.c b/src/defrag.c index 93a7389d9..010d4de23 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -754,6 +754,32 @@ void defragSet(defragKeysCtx *ctx, kvobj *ob) { ob->ptr = newd; } +/* Arrays can be expensive to defrag in one shot because they may contain many + * independently allocated slices. Small arrays are defragmented immediately, + * while large arrays are queued for later and processed one slice per step. */ +void defragArray(defragKeysCtx *ctx, kvobj *ob) { + serverAssert(ob->type == OBJ_ARRAY); + /* Maybe arCount() is not the best possible value to check against + * server.active_defrag_max_scan_fields, also because anyway when we + * defrag incrementally, we defrag a single slice per call. Yet it makes + * sense in a not very obvious way, for several reasons: + * + * 1. 
If the array is very sparse, it is an upper bound to the max + * number of slices it is composed of. + * 2. If the array is dense, we will scan in the default case at most 4096 + * entries, and the default defrag limit for max scans is 1000. They + * are kinda comparable numbers. + * 3. In case of a highly sparse array with huge indexes, in superdir mode, + * yet the super blocks are going to be at most arCount(). + * + * So regardless of the fact we later will defrag in slice units, this + * is a good trigger for the one shot or incremental selection. */ + if (arCount(ob->ptr) > server.active_defrag_max_scan_fields) + defragLater(ctx, ob); + else + ob->ptr = arDefrag(ob->ptr, activeDefragAlloc); +} + /* Defrag callback for radix tree iterator, called for each node, * used in order to defrag the nodes allocations. */ int defragRaxNode(raxNode **noderef, void *privdata) { @@ -1172,6 +1198,8 @@ void defragKey(defragKeysCtx *ctx, dictEntry *de, dictEntryLink link) { #endif } else if (ob->type == OBJ_MODULE) { defragModule(ctx,db, ob); + } else if (ob->type == OBJ_ARRAY) { + defragArray(ctx, ob); } else { serverPanic("Unknown object type"); } @@ -1288,6 +1316,10 @@ int defragLaterItem(kvobj *ob, unsigned long *cursor, monotime endtime, int dbid robj keyobj; initStaticStringObject(keyobj, kvobjGetKey(ob)); return moduleLateDefrag(&keyobj, ob, cursor, endtime, dbid); + } else if (ob->type == OBJ_ARRAY) { + redisArray *ar = ob->ptr; + *cursor = arDefragIncremental(&ar, *cursor, activeDefragAlloc); + ob->ptr = ar; } else { *cursor = 0; /* object type/encoding may have changed since we schedule it for later */ } diff --git a/src/hotkeys.c b/src/hotkeys.c index bdcc831e4..817a8c394 100644 --- a/src/hotkeys.c +++ b/src/hotkeys.c @@ -13,11 +13,6 @@ #include "cluster.h" #include -static inline int nearestNextPowerOf2(unsigned int count) { - if (count <= 1) return 1; - return 1 << (32 - __builtin_clz(count-1)); -} - /* Comparison function for qsort to sort slot indices */ static 
inline int slotCompare(const void *a, const void *b) { return (*(const int *)a) - (*(const int *)b); diff --git a/src/lazyfree.c b/src/lazyfree.c index 8d291bc9a..f9cde4e7e 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -207,6 +207,9 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { /* If the module's free_effort returns 0, we will use asynchronous free * memory by default. */ return effort == 0 ? ULONG_MAX : effort; + } else if (obj->type == OBJ_ARRAY) { + redisArray *ar = obj->ptr; + return arCount(ar); } else { return 1; /* Everything else is a single allocation. */ } diff --git a/src/module.c b/src/module.c index 5a6f510ac..0b1fb131d 100644 --- a/src/module.c +++ b/src/module.c @@ -4255,6 +4255,7 @@ int RM_KeyType(RedisModuleKey *key) { case OBJ_MODULE: return REDISMODULE_KEYTYPE_MODULE; case OBJ_STREAM: return REDISMODULE_KEYTYPE_STREAM; case OBJ_GCRA: return REDISMODULE_KEYTYPE_GCRA; + case OBJ_ARRAY: return REDISMODULE_KEYTYPE_ARRAY; default: return REDISMODULE_KEYTYPE_EMPTY; } } diff --git a/src/networking.c b/src/networking.c index 0030078e7..3bcd74e82 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1181,6 +1181,18 @@ void addReplyLongLongFromStr(client *c, robj *str) { addReplyProto(c,"\r\n",2); } +/* Reply with unsigned 64-bit value. Uses integer reply when value fits in + * signed long long, otherwise big number (RESP3) or bulk string (RESP2). 
*/ +void addReplyUnsignedLongLong(client *c, uint64_t v) { + if (v <= (uint64_t)LLONG_MAX) { + addReplyLongLong(c, (long long)v); + } else { + char buf[LONG_STR_SIZE]; + int len = ull2string(buf, sizeof(buf), v); + addReplyBigNum(c, buf, len); + } +} + void addReplyAggregateLen(client *c, long length, int prefix) { serverAssert(length >= 0); if (_prepareClientToWrite(c) != C_OK) return; diff --git a/src/notify.c b/src/notify.c index 729865f0e..c8e884204 100644 --- a/src/notify.c +++ b/src/notify.c @@ -37,6 +37,7 @@ int keyspaceEventsStringToFlags(char *classes) { case 't': flags |= NOTIFY_STREAM; break; case 'm': flags |= NOTIFY_KEY_MISS; break; case 'd': flags |= NOTIFY_MODULE; break; + case 'a': flags |= NOTIFY_ARRAY; break; case 'n': flags |= NOTIFY_NEW; break; case 'o': flags |= NOTIFY_OVERWRITTEN; break; case 'c': flags |= NOTIFY_TYPE_CHANGED; break; @@ -72,6 +73,7 @@ sds keyspaceEventsFlagsToString(int flags) { if (flags & NOTIFY_EVICTED) res = sdscatlen(res,"e",1); if (flags & NOTIFY_STREAM) res = sdscatlen(res,"t",1); if (flags & NOTIFY_MODULE) res = sdscatlen(res,"d",1); + if (flags & NOTIFY_ARRAY) res = sdscatlen(res,"a",1); if (flags & NOTIFY_NEW) res = sdscatlen(res,"n",1); if (flags & NOTIFY_OVERWRITTEN) res = sdscatlen(res,"o",1); if (flags & NOTIFY_TYPE_CHANGED) res = sdscatlen(res,"c",1); diff --git a/src/object.c b/src/object.c index 44778014b..4ba1b4978 100644 --- a/src/object.c +++ b/src/object.c @@ -531,6 +531,13 @@ robj *createGCRAObject(long long value) { return o; } +robj *createArrayObject(void) { + redisArray *ar = arNew(); + robj *o = createObject(OBJ_ARRAY, ar); + o->encoding = OBJ_ENCODING_SLICED_ARRAY; + return o; +} + robj *createModuleObject(moduleType *mt, void *value) { moduleValue *mv = zmalloc(sizeof(*mv)); mv->type = mt; @@ -611,6 +618,10 @@ void freeGCRAObject(robj *o) { #endif } +void freeArrayObject(robj *o) { + arFree(o->ptr); +} + void incrRefCount(robj *o) { if (o->refcount < OBJ_FIRST_SPECIAL_REFCOUNT - 1) { o->refcount++; 
@@ -663,6 +674,7 @@ void decrRefCount(robj *o) { case OBJ_MODULE: freeModuleObject(o); break; case OBJ_STREAM: freeStreamObject(o); break; case OBJ_GCRA: freeGCRAObject(o); break; + case OBJ_ARRAY: freeArrayObject(o); break; default: serverPanic("Unknown object type"); break; } } @@ -810,6 +822,11 @@ void dismissStreamObject(robj *o, size_t size_hint) { } } +/* See dismissObject() */ +void dismissArrayObject(robj *o, size_t size_hint) { + arDismiss(o->ptr, size_hint); +} + void dismissGCRAObject(robj *o, size_t size_hint) { /* GCRA is a single allocation of a long long thus way smaller than a * page-size. The dismiss mechanism is not needed for it - hence NOOP.*/ @@ -846,6 +863,7 @@ void dismissObject(robj *o, size_t size_hint) { case OBJ_HASH: dismissHashObject(o, size_hint); break; case OBJ_STREAM: dismissStreamObject(o, size_hint); break; case OBJ_GCRA: dismissGCRAObject(o, size_hint); break; + case OBJ_ARRAY: dismissArrayObject(o, size_hint); break; default: break; } #else @@ -968,6 +986,7 @@ size_t getObjectLength(robj *o) { case OBJ_HASH: return hashTypeLength(o, 0); case OBJ_STREAM: return streamLength(o); case OBJ_GCRA: return gcraObjectLength(o); + case OBJ_ARRAY: return arCount(o->ptr); default: return 0; } } @@ -1265,6 +1284,7 @@ char *strEncoding(int encoding) { case OBJ_ENCODING_SKIPLIST: return "skiplist"; case OBJ_ENCODING_EMBSTR: return "embstr"; case OBJ_ENCODING_STREAM: return "stream"; + case OBJ_ENCODING_SLICED_ARRAY: return "sliced-array"; default: return "unknown"; } } @@ -1283,7 +1303,8 @@ size_t kvobjComputeSize(robj *key, kvobj *o, size_t sample_size, int dbid) { o->type == OBJ_ZSET || o->type == OBJ_HASH || o->type == OBJ_STREAM || - o->type == OBJ_GCRA) + o->type == OBJ_GCRA || + o->type == OBJ_ARRAY) { return kvobjAllocSize(o); } else if (o->type == OBJ_MODULE) { @@ -1311,6 +1332,9 @@ size_t kvobjAllocSize(kvobj *o) { asize += s->alloc_size; } else if (o->type == OBJ_GCRA) { asize += gcraTypeAllocSize(o); + } else if (o->type == 
OBJ_ARRAY) { + redisArray *ar = o->ptr; + asize += ar->alloc_size; } else if (o->type == OBJ_MODULE) { /* TODO: Provide moduleGetAllocSize() module API for O(1) allocation size retrieval */ } diff --git a/src/object.h b/src/object.h index 6b2591877..9fbf0f04e 100644 --- a/src/object.h +++ b/src/object.h @@ -85,6 +85,7 @@ struct RedisModuleType; #define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */ #define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */ #define OBJ_ENCODING_LISTPACK_EX 12 /* Encoded as listpack, extended with metadata */ +#define OBJ_ENCODING_SLICED_ARRAY 13 /* Encoded as sliced array */ #define LRU_BITS 24 #define LRU_CLOCK_MAX ((1<lru */ @@ -163,6 +164,7 @@ robj *createZsetListpackObject(void); robj *createStreamObject(void); robj *createGCRAObject(long long value); robj *createModuleObject(struct RedisModuleType *mt, void *value); +robj *createArrayObject(void); int getLongFromObjectOrReply(struct client *c, robj *o, long *target, const char *msg); int getPositiveLongFromObjectOrReply(struct client *c, robj *o, long *target, const char *msg); int getRangeLongFromObjectOrReply(struct client *c, robj *o, long min, long max, long *target, const char *msg); diff --git a/src/rdb.c b/src/rdb.c index e174e633a..d6fb3f3d2 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -124,33 +124,42 @@ time_t rdbLoadTime(rio *rdb) { return (time_t)t32; } -ssize_t rdbSaveMillisecondTime(rio *rdb, long long t) { - int64_t t64 = (int64_t) t; - memrev64ifbe(&t64); /* Store in little endian. */ - return rdbWriteRaw(rdb,&t64,8); +/* Save a signed 64-bit integer in little-endian format. */ +ssize_t rdbSaveSignedInteger(rio *rdb, int64_t val) { + memrev64ifbe(&val); /* Store in little endian. */ + return rdbWriteRaw(rdb, &val, 8); } -/* This function loads a time from the RDB file. 
It gets the version of the - * RDB because, unfortunately, before Redis 5 (RDB version 9), the function - * failed to convert data to/from little endian, so RDB files with keys having - * expires could not be shared between big endian and little endian systems - * (because the expire time will be totally wrong). The fix for this is just - * to call memrev64ifbe(), however if we fix this for all the RDB versions, +/* This function loads a signed 64-bit integer from the RDB file. It gets the + * version of the RDB because, unfortunately, before Redis 5 (RDB version 9), + * the function failed to convert data to/from little endian, so RDB files with + * keys having expires could not be shared between big endian and little endian + * systems (because the expire time will be totally wrong). The fix for this is + * just to call memrev64ifbe(), however if we fix this for all the RDB versions, * this call will introduce an incompatibility for big endian systems: * after upgrading to Redis version 5 they will no longer be able to load their * own old RDB files. Because of that, we instead fix the function only for new * RDB versions, and load older RDB versions as we used to do in the past, * allowing big endian systems to load their own old RDB files. * - * On I/O error the function returns LLONG_MAX, however if this is also a + * On I/O error the function returns INT64_MAX, however if this is also a * valid stored value, the caller should use rioGetReadError() to check for * errors after calling this function. */ -long long rdbLoadMillisecondTime(rio *rdb, int rdbver) { - int64_t t64; - if (rioRead(rdb,&t64,8) == 0) return LLONG_MAX; +int64_t rdbLoadSignedInteger(rio *rdb, int rdbver) { + int64_t val; + if (rioRead(rdb, &val, 8) == 0) return INT64_MAX; if (rdbver >= 9) /* Check the top comment of this function. */ - memrev64ifbe(&t64); /* Convert in big endian if the system is BE. 
*/ - return (long long)t64; + memrev64ifbe(&val); /* Convert in big endian if the system is BE. */ + return val; +} + +/* Wrappers for millisecond time - these just call the signed integer functions */ +ssize_t rdbSaveMillisecondTime(rio *rdb, long long t) { + return rdbSaveSignedInteger(rdb, (int64_t)t); +} + +long long rdbLoadMillisecondTime(rio *rdb, int rdbver) { + return (long long)rdbLoadSignedInteger(rdb, rdbver); } /* Saves an encoded length. The first two bits in the first byte are used to @@ -717,6 +726,8 @@ int rdbSaveObjectType(rio *rdb, robj *o) { return rdbSaveType(rdb,RDB_TYPE_GCRA); case OBJ_MODULE: return rdbSaveType(rdb,RDB_TYPE_MODULE_2); + case OBJ_ARRAY: + return rdbSaveType(rdb,RDB_TYPE_ARRAY); default: serverPanic("Unknown object type"); } @@ -1039,6 +1050,68 @@ size_t rdbSaveStreamConsumers(rio *rdb, streamCG *cg) { /* Save a Redis object. * Returns -1 on error, number of bytes written on success. */ +static ssize_t rdbSaveArrayElement(rio *rdb, uint64_t idx, void *v) { + ssize_t n, nwritten = 0; + + if ((n = rdbSaveLen(rdb, idx)) == -1) return -1; + nwritten += n; + + if (arIsInt(v)) { + if ((n = rdbSaveLen(rdb, AR_RDB_TAG_INT)) == -1) return -1; + nwritten += n; + int64_t ival = arToInt(v); + if ((n = rdbSaveSignedInteger(rdb, ival)) == -1) return -1; + nwritten += n; + } else if (arIsFloat(v)) { + if ((n = rdbSaveLen(rdb, AR_RDB_TAG_FLOAT)) == -1) return -1; + nwritten += n; + double d = arToDouble(v); + if (rdbSaveBinaryDoubleValue(rdb, d) == -1) return -1; + nwritten += 8; + } else if (arIsSmallStr(v)) { + char buf[AR_SMALLSTR_MAXLEN + 1]; + int len = arToSmallStr(v, buf); + if ((n = rdbSaveLen(rdb, AR_RDB_TAG_SMALLSTR)) == -1) return -1; + nwritten += n; + if ((n = rdbSaveRawString(rdb, (unsigned char *)buf, len)) == -1) return -1; + nwritten += n; + } else { + if ((n = rdbSaveLen(rdb, AR_RDB_TAG_SDS)) == -1) return -1; + nwritten += n; + if ((n = rdbSaveRawString(rdb, (unsigned char *)arStringData(v), arStringLen(v))) == -1) return 
-1; + nwritten += n; + } + + return nwritten; +} + +static ssize_t rdbSaveArraySlice(rio *rdb, arSlice *s, uint64_t slice_id, + uint32_t slice_size) { + ssize_t n, nwritten = 0; + + if (s->encoding == AR_SLICE_DENSE) { + for (uint32_t i = 0; i < s->layout.dense.winsize; i++) { + void *v = s->layout.dense.items[i]; + if (arIsEmpty(v)) continue; + + uint64_t idx = arMakeIdx(slice_id, s->layout.dense.offset + i, slice_size); + if ((n = rdbSaveArrayElement(rdb, idx, v)) == -1) return -1; + nwritten += n; + } + } else { + uint16_t *offsets = s->layout.sparse.offsets; + void **values = s->layout.sparse.values; + + for (uint32_t i = 0; i < s->count; i++) { + uint64_t idx = arMakeIdx(slice_id, offsets[i], slice_size); + if ((n = rdbSaveArrayElement(rdb, idx, values[i])) == -1) return -1; + nwritten += n; + } + } + + return nwritten; +} + ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { ssize_t n = 0, nwritten = 0; @@ -1432,6 +1505,57 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { zfree(io.ctx); } return io.error ? -1 : (ssize_t)io.bytes; + } else if (o->type == OBJ_ARRAY) { + /* Save an array value. We persist only elements and insert_idx - no + * implementation details like slice_size. Arrays are loaded using + * the current ar_slice_size config. */ + redisArray *ar = o->ptr; + + /* Save count */ + if ((n = rdbSaveLen(rdb, ar->count)) == -1) return -1; + nwritten += n; + + /* Save insert_idx: 0 = none, 1 = has value followed by actual value. + * We can't save UINT64_MAX directly with rdbSaveLen/rdbLoadLen because + * rdbLoadLen returns UINT64_MAX (RDB_LENERR) to signal an error, making + * it impossible to distinguish a valid UINT64_MAX value from an error. 
*/ + if (ar->insert_idx == AR_INSERT_IDX_NONE) { + if ((n = rdbSaveLen(rdb, 0)) == -1) return -1; + nwritten += n; + } else { + if ((n = rdbSaveLen(rdb, 1)) == -1) return -1; + nwritten += n; + if ((n = rdbSaveLen(rdb, ar->insert_idx)) == -1) return -1; + nwritten += n; + } + + /* Save elements in index order. + * We need to iterate through all slices, handling both flat directory + * mode and superdir mode. In superdir mode, blocks are sorted by + * block_id, so we iterate through blocks in order. */ + if (ar->superdir) { + /* Superdir mode: iterate through blocks */ + for (uint32_t bi = 0; bi < ar->sdir_len; bi++) { + arSDirEntry *e = ar->superdir + bi; + uint64_t block_base = e->block_id * AR_SUPER_BLOCK_SLOTS; + + for (uint32_t si = 0; si < AR_SUPER_BLOCK_SLOTS; si++) { + arSlice *s = e->slots[si]; + if (!s) continue; + uint64_t slice_id = block_base + si; + if ((n = rdbSaveArraySlice(rdb, s, slice_id, ar->slice_size)) == -1) return -1; + nwritten += n; + } + } + } else { + /* Flat directory mode */ + for (uint64_t slice_id = 0; slice_id <= ar->dir_highest_used && slice_id < ar->dir_alloc; slice_id++) { + arSlice *s = ar->dir[slice_id]; + if (!s) continue; + if ((n = rdbSaveArraySlice(rdb, s, slice_id, ar->slice_size)) == -1) return -1; + nwritten += n; + } + } } else { serverPanic("Unknown object type"); } @@ -3653,6 +3777,104 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) return NULL; } o = createGCRAObject((long long)time); + } else if (rdbtype == RDB_TYPE_ARRAY) { + /* Load array value. We only persist elements and insert_idx - no + * implementation details. Arrays use current ar_slice_size config. 
*/ + uint64_t count; + if ((count = rdbLoadLen(rdb, NULL)) == RDB_LENERR) return NULL; + if (count == 0) { + rdbReportCorruptRDB("Empty array (count == 0) is invalid"); + return NULL; + } + + /* Load insert_idx: 0 = none, 1 = has value followed by actual value */ + uint64_t insert_idx_flag; + if ((insert_idx_flag = rdbLoadLen(rdb, NULL)) == RDB_LENERR) return NULL; + if (insert_idx_flag > 1) { + rdbReportCorruptRDB("Invalid array insert_idx_flag %llu", + (unsigned long long)insert_idx_flag); + return NULL; + } + uint64_t insert_idx; + if (insert_idx_flag == 0) { + insert_idx = AR_INSERT_IDX_NONE; + } else { + if ((insert_idx = rdbLoadLen(rdb, NULL)) == RDB_LENERR) return NULL; + } + + o = createArrayObject(); + redisArray *ar = o->ptr; + ar->insert_idx = insert_idx; + + /* Load elements */ + for (uint64_t i = 0; i < count; i++) { + uint64_t idx; + int idx_isencoded; + if (rdbLoadLenByRef(rdb, &idx_isencoded, &idx) == -1) { + decrRefCount(o); + return NULL; + } + if (idx_isencoded || idx == UINT64_MAX) { + decrRefCount(o); + rdbReportCorruptRDB("Invalid array index %llu", + (unsigned long long)idx); + return NULL; + } + + uint64_t type_tag; + if ((type_tag = rdbLoadLen(rdb, NULL)) == RDB_LENERR) { + decrRefCount(o); + return NULL; + } + + void *v; + if (type_tag == AR_RDB_TAG_INT) { + int64_t ival = rdbLoadSignedInteger(rdb, RDB_VERSION); + if (ival == INT64_MAX && rioGetReadError(rdb)) { + decrRefCount(o); + return NULL; + } + v = arValueFromRdbInt(ival); + } else if (type_tag == AR_RDB_TAG_FLOAT) { + double d; + if (rdbLoadBinaryDoubleValue(rdb, &d) == -1) { + decrRefCount(o); + return NULL; + } + v = arValueFromRdbFloat(d); + } else if (type_tag == AR_RDB_TAG_SMALLSTR) { + sds str; + if ((str = rdbGenericLoadStringObject(rdb, RDB_LOAD_SDS, NULL)) == NULL) { + decrRefCount(o); + return NULL; + } + size_t len = sdslen(str); + if (len > AR_SMALLSTR_MAXLEN) { + sdsfree(str); + decrRefCount(o); + rdbReportCorruptRDB("Invalid small string length %zu in array", len); + 
return NULL; + } + v = arValueFromRdbSmallStr(str, sdslen(str)); + sdsfree(str); + } else if (type_tag == AR_RDB_TAG_SDS) { + /* arString */ + sds str; + if ((str = rdbGenericLoadStringObject(rdb, RDB_LOAD_SDS, NULL)) == NULL) { + decrRefCount(o); + return NULL; + } + v = arEncode(str, sdslen(str)); + sdsfree(str); + } else { + decrRefCount(o); + rdbReportCorruptRDB("Unknown array element type_tag %llu", + (unsigned long long)type_tag); + return NULL; + } + + arSet(ar, idx, v); + } } else { rdbReportReadError("Unknown RDB encoding type %d",rdbtype); return NULL; diff --git a/src/rdb.h b/src/rdb.h index f1ea72150..159992dc0 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -81,10 +81,11 @@ #define RDB_TYPE_STREAM_LISTPACKS_4 26 /* Stream with IDMP support */ #define RDB_TYPE_STREAM_LISTPACKS_5 27 /* Stream with XNACK support (NACKed entries) */ #define RDB_TYPE_GCRA 28 /* GCRA object */ +#define RDB_TYPE_ARRAY 29 /* Array data type */ /* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType(), and rdb_type_string[] */ /* Test if a type is an object type. */ -#define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 28)) +#define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 29)) /* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */ #define RDB_OPCODE_KEY_META 243 /* Key metadata (module metadata classes). 
*/ @@ -133,6 +134,8 @@ int rdbSaveType(rio *rdb, unsigned char type); int rdbLoadType(rio *rdb); time_t rdbLoadTime(rio *rdb); int rdbSaveLen(rio *rdb, uint64_t len); +ssize_t rdbSaveSignedInteger(rio *rdb, int64_t val); +int64_t rdbLoadSignedInteger(rio *rdb, int rdbver); ssize_t rdbSaveMillisecondTime(rio *rdb, long long t); long long rdbLoadMillisecondTime(rio *rdb, int rdbver); uint64_t rdbLoadLen(rio *rdb, int *isencoded); diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c index 1bbebb691..d3bbe4b40 100644 --- a/src/redis-check-rdb.c +++ b/src/redis-check-rdb.c @@ -89,6 +89,7 @@ char *rdb_type_string[] = { "stream-v4", "stream-v5", "gcra", + "array", }; /* Show a few stats collected into 'rdbstate' */ diff --git a/src/redismodule.h b/src/redismodule.h index fae09c3fb..d78b0e26d 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -90,6 +90,7 @@ typedef long long ustime_t; #define REDISMODULE_KEYTYPE_MODULE 6 #define REDISMODULE_KEYTYPE_STREAM 7 #define REDISMODULE_KEYTYPE_GCRA 8 +#define REDISMODULE_KEYTYPE_ARRAY 9 /* Reply types. */ #define REDISMODULE_REPLY_UNKNOWN -1 @@ -254,18 +255,19 @@ This flag should not be used directly by the module. #define REDISMODULE_NOTIFY_SUBKEYEVENT (1<<20) /* T */ #define REDISMODULE_NOTIFY_SUBKEYSPACEITEM (1<<21) /* I */ #define REDISMODULE_NOTIFY_SUBKEYSPACEEVENT (1<<22) /* V */ +#define REDISMODULE_NOTIFY_ARRAY (1<<23) /* a, array key space notification */ /* Next notification flag, must be updated when adding new flags above! This flag should not be used directly by the module. * Use RedisModule_GetKeyspaceNotificationFlagsAll instead. */ -#define _REDISMODULE_NOTIFY_NEXT (1<<23) +#define _REDISMODULE_NOTIFY_NEXT (1<<24) /* Delivery flags for RM_SubscribeToKeyspaceEventsWithSubkeys. * These are passed in the 'flags' parameter, not in 'types'. 
*/ #define REDISMODULE_NOTIFY_FLAG_NONE 0 /* Invoke callback for all matching events */ #define REDISMODULE_NOTIFY_FLAG_SUBKEYS_REQUIRED (1<<0) /* Only invoke callback when subkeys are present */ -#define REDISMODULE_NOTIFY_ALL (REDISMODULE_NOTIFY_GENERIC | REDISMODULE_NOTIFY_STRING | REDISMODULE_NOTIFY_LIST | REDISMODULE_NOTIFY_SET | REDISMODULE_NOTIFY_HASH | REDISMODULE_NOTIFY_ZSET | REDISMODULE_NOTIFY_EXPIRED | REDISMODULE_NOTIFY_EVICTED | REDISMODULE_NOTIFY_STREAM | REDISMODULE_NOTIFY_MODULE) /* A */ +#define REDISMODULE_NOTIFY_ALL (REDISMODULE_NOTIFY_GENERIC | REDISMODULE_NOTIFY_STRING | REDISMODULE_NOTIFY_LIST | REDISMODULE_NOTIFY_SET | REDISMODULE_NOTIFY_HASH | REDISMODULE_NOTIFY_ZSET | REDISMODULE_NOTIFY_EXPIRED | REDISMODULE_NOTIFY_EVICTED | REDISMODULE_NOTIFY_STREAM | REDISMODULE_NOTIFY_MODULE | REDISMODULE_NOTIFY_ARRAY) /* A */ /* A special pointer that we can use between the core and the module to signal * field deletion, and that is impossible to be a valid pointer. */ diff --git a/src/server.h b/src/server.h index 6cecc6424..13d05ce0e 100644 --- a/src/server.h +++ b/src/server.h @@ -22,6 +22,7 @@ #include "atomicvar.h" #include "commands.h" #include "object.h" +#include "sparsearray.h" #include #include @@ -288,6 +289,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define ACL_CATEGORY_TRANSACTION (1ULL<<19) #define ACL_CATEGORY_SCRIPTING (1ULL<<20) #define ACL_CATEGORY_RATE_LIMIT (1ULL<<21) +#define ACL_CATEGORY_ARRAY (1ULL<<22) /* Key-spec flags * * -------------- */ @@ -801,7 +803,8 @@ typedef enum { #define NOTIFY_SUBKEYEVENT (1<<20) /* T, subkey-level keyevent notification */ #define NOTIFY_SUBKEYSPACEITEM (1<<21) /* I, subkey-level notification per item: channel=key\nsubkey */ #define NOTIFY_SUBKEYSPACEEVENT (1<<22) /* V, subkey-level notification: channel=event|key */ -#define NOTIFY_ALL (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_EXPIRED | NOTIFY_EVICTED | NOTIFY_STREAM | 
NOTIFY_MODULE) /* A flag */ +#define NOTIFY_ARRAY (1<<23) /* a, array notification */ +#define NOTIFY_ALL (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_EXPIRED | NOTIFY_EVICTED | NOTIFY_STREAM | NOTIFY_MODULE | NOTIFY_ARRAY) /* A flag */ /* Using the following macro you can run code inside serverCron() with the * specified period, specified in milliseconds. @@ -866,7 +869,8 @@ typedef enum { #define OBJ_MODULE 5 /* Module object. */ #define OBJ_STREAM 6 /* Stream object. */ #define OBJ_GCRA 7 /* GCRA object. */ -#define OBJ_TYPE_MAX 8 /* Maximum number of object types */ +#define OBJ_ARRAY 8 /* Array object. */ +#define OBJ_TYPE_MAX 9 /* Maximum number of object types */ /* NOTE: adding a new object requires changes in the following places: * - rdb.c - save/load (also bump RDB_VERSION if needed) @@ -2442,6 +2446,10 @@ struct redisServer { /* Stream IDMP parameters */ long long stream_idmp_duration; /* Default IDMP duration in seconds. */ long long stream_idmp_maxsize; /* Default IDMP max entries. 
*/ + /* Array parameters */ + uint32_t array_slice_size; /* Slice size for new arrays */ + uint32_t array_sparse_kmax; /* Max elements before sparse->dense */ + uint32_t array_sparse_kmin; /* Min elements before dense->sparse */ /* List parameters */ int list_max_listpack_size; int list_compress_depth; @@ -2801,6 +2809,7 @@ typedef enum { COMMAND_GROUP_GEO, COMMAND_GROUP_STREAM, COMMAND_GROUP_BITMAP, + COMMAND_GROUP_ARRAY, COMMAND_GROUP_MODULE, COMMAND_GROUP_RATE_LIMIT, } redisCommandGroup; @@ -3213,6 +3222,7 @@ void addReplyBigNum(client *c, const char *num, size_t len); void addReplyHumanLongDouble(client *c, long double d); void addReplyLongLong(client *c, long long ll); void addReplyLongLongFromStr(client *c, robj* str); +void addReplyUnsignedLongLong(client *c, uint64_t v); void addReplyArrayLen(client *c, long length); void addReplyMapLen(client *c, long length); void addReplySetLen(client *c, long length); @@ -3844,6 +3854,9 @@ struct listpackEx *listpackExCreate(void); void listpackExAddNew(robj *o, char *field, size_t flen, char *value, size_t vlen, uint64_t expireAt); +/* Array data type. 
*/ +robj *arrayTypeDup(robj *o); + /* Pub / Sub */ int pubsubUnsubscribeAllChannels(client *c, int notify); int pubsubUnsubscribeShardAllChannels(client *c, int notify); @@ -4511,6 +4524,26 @@ void digestCommand(client *c); void gcraCommand(client *c); void gcraSetValueCommand(client *c); +/* Array commands (t_array.c) */ +void arsetCommand(client *c); +void argetCommand(client *c); +void ardelCommand(client *c); +void ardelrangeCommand(client *c); +void arlenCommand(client *c); +void arcountCommand(client *c); +void argetrangeCommand(client *c); +void arscanCommand(client *c); +void argrepCommand(client *c); +void aropCommand(client *c); +void arinsertCommand(client *c); +void arringCommand(client *c); +void arnextCommand(client *c); +void arseekCommand(client *c); +void arlastitemsCommand(client *c); +void arinfoCommand(client *c); +void armsetCommand(client *c); +void armgetCommand(client *c); + #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated)); void free(void *ptr) __attribute__ ((deprecated)); diff --git a/src/sparsearray.c b/src/sparsearray.c new file mode 100644 index 000000000..d4945f2a7 --- /dev/null +++ b/src/sparsearray.c @@ -0,0 +1,2080 @@ +/* + * Copyright (c) 2026-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + * + * Sparse Array - A memory-efficient sparse array with 64-bit index space. + * Originally authored by: Salvatore Sanfilippo. + * + * This data structure was designed and implemented by Salvatore Sanfilippo. + */ + +#include "server.h" +#include +#include + +/****************************************************************************** + * SPARSE ARRAY IMPLEMENTATION + * + * Sparse arrays are random-access sequences indexed by non-negative 64-bit + * integers. 
They support O(1) get/set operations and efficient iteration. + * + * Arrays use tagged pointer-sized values. 64-bit builds inline more payload, + * while 32-bit builds use narrower immediate encodings and fall back to + * arString more often. SDS strings are not used as values since the final + * bits of SDS pointers are not guaranteed to be zero. + * + * See sparsearray.h for data structure documentation and inline helpers. + * + *****************************************************************************/ + +/* ---------------------------------------------------------------------------- + * Configuration - mapped to Redis server struct for easy standalone adaptation + * -------------------------------------------------------------------------- */ + +#define ArraySliceSize server.array_slice_size +#define ArraySparseKMax server.array_sparse_kmax +#define ArraySparseKMin server.array_sparse_kmin + +/* ---------------------------------------------------------------------------- + * Allocation size tracking + * + * Every zmalloc/zfree/zrealloc that contributes to the array's footprint is + * tracked in ar->alloc_size so that kvobjAllocSize() can return an O(1) + * answer. When ar is NULL (e.g. during arFree) tracking is skipped. 
+ * -------------------------------------------------------------------------- */ + +static inline void *arAllocAndTrack(redisArray *ar, size_t size) { + size_t usable; + void *ptr = zmalloc_usable(size, &usable); + if (ar) ar->alloc_size += usable; + return ptr; +} +static inline void *arCallocAndTrack(redisArray *ar, size_t size) { + size_t usable; + void *ptr = zcalloc_usable(size, &usable); + if (ar) ar->alloc_size += usable; + return ptr; +} +static inline void arFreeAndTrack(redisArray *ar, void *ptr) { + size_t usable; + zfree_usable(ptr, &usable); + if (ar) ar->alloc_size -= usable; +} +static inline void *arReallocAndTrack(redisArray *ar, void *ptr, size_t size) { + size_t usable, old_usable; + void *newptr = zrealloc_usable(ptr, size, &usable, &old_usable); + if (ar) ar->alloc_size += usable - old_usable; + return newptr; +} + +/* Track a tagged value entering/leaving the array (arString bookkeeping). */ +static inline void arTrackValueIn(redisArray *ar, void *v) { + if (ar && arIsPtr(v)) ar->alloc_size += zmalloc_size(v); +} +static inline void arTrackValueOut(redisArray *ar, void *v) { + if (ar && arIsPtr(v)) ar->alloc_size -= zmalloc_size(v); +} + +/* ---------------------------------------------------------------------------- + * Internal helpers + * -------------------------------------------------------------------------- */ + +static inline size_t arStringHeaderSize(size_t len) { + return (len <= 32767) ? 2 : 8; +} + +size_t arStringLen(const void *ptr) { + const uint8_t *p = (const uint8_t *)ptr; + if (p[0] & 0x80) { + return ((size_t)(p[0] & 0x7F) << 8) | p[1]; + } else { + size_t len = 0; + for (int i = 0; i < 8; i++) len = (len << 8) | p[i]; + return len; + } +} + +const char *arStringData(const void *ptr) { + const uint8_t *p = (const uint8_t *)ptr; + return (const char *)(p + ((p[0] & 0x80) ? 
2 : 8)); +} + +static inline size_t arSparseAllocSize(uint32_t cap) { + size_t offsets_size = cap * sizeof(uint16_t); + size_t padding = (sizeof(void *) - (offsets_size % sizeof(void *))) % sizeof(void *); + return sizeof(arSlice) + offsets_size + padding + cap * sizeof(void *); +} + +static inline size_t arDenseAllocSize(uint32_t winsize) { + return sizeof(arSlice) + winsize * sizeof(void *); +} + +static inline uint32_t arSliceMaxIdx(arSlice *s) { + if (s->encoding == AR_SLICE_DENSE) { + return s->layout.dense.max_idx; + } else { + return s->layout.sparse.offsets[s->count - 1]; + } +} + +/* ---------------------------------------------------------------------------- + * arString type + * -------------------------------------------------------------------------- */ + +/* Allocate a new arString with the given content. + * + * We use arString instead of SDS because SDS pointers are not guaranteed to + * have the low bits zero (SDS points inside an allocation, after the header). + * Our tagged pointer scheme needs tag 00 for heap strings, so we need aligned + * pointers. zmalloc guarantees sufficient alignment. + * + * arString has two header formats: + * + * 1. Short header (2 bytes): lengths up to 32767 bytes. + * The top bit of the first byte is set, and the remaining 15 bits store + * the length in big-endian form. + * + * +--------+--------+-------------------+ + * |1LLLLLLL|LLLLLLLL| payload | + * +--------+--------+-------------------+ + * byte 0 byte 1 + * + * 2. Long header (8 bytes): lengths up to 2^63-1 bytes. + * The top bit of the first byte is clear, and the remaining 63 bits store + * the length in big-endian form. 
/* Allocate a new arString holding a copy of the 'len' bytes at 's'.
 *
 * arString is used instead of SDS because SDS pointers point inside an
 * allocation (after the header), so their low bits are not guaranteed to be
 * zero. The tagged pointer scheme needs tag 00 for heap strings, hence an
 * aligned pointer to the start of the allocation is required.
 *
 * Two header formats exist:
 *
 * 1. Short header (2 bytes): lengths up to 32767 bytes.
 *    Top bit of byte 0 set, remaining 15 bits are the big-endian length.
 *
 *    +--------+--------+-------------------+
 *    |1LLLLLLL|LLLLLLLL|      payload      |
 *    +--------+--------+-------------------+
 *      byte 0   byte 1
 *
 * 2. Long header (8 bytes): lengths up to 2^63-1 bytes.
 *    Top bit of byte 0 clear, remaining 63 bits are the big-endian length.
 *
 *    +--------+--------+--------+--------+--------+--------+--------+--//
 *    |0LLLLLLL|LLLLLLLL|LLLLLLLL|LLLLLLLL|LLLLLLLL|LLLLLLLL|LLLLLLLL|
 *    +--------+--------+--------+--------+--------+--------+--------+--//
 *      byte 0   byte 1   byte 2   byte 3   byte 4   byte 5   byte 6
 *
 *    //--+--------+-------------------+
 *        |LLLLLLLL|      payload      |
 *    //--+--------+-------------------+
 *          byte 7
 *
 * The 63-bit length is used even on 32-bit targets for simplicity; the
 * overhead for strings > 32k is small.
 *
 * The returned pointer is the start of the header; the string data begins
 * immediately after it. Release it with arStringFree(). */
void *arStringNew(const char *s, size_t len) {
    /* Length is stored in 63 bits; reject >= 2^63 to avoid header
     * corruption. On 32-bit builds size_t already guarantees this, so the
     * always-true assertion is not compiled. */
#if SIZE_MAX > UINT32_MAX
    serverAssert(len < ((size_t)1 << 63));
#endif
    size_t hdr_size = arStringHeaderSize(len);
    uint8_t *ptr = zmalloc(hdr_size + len);

    if (hdr_size == 2) {
        /* Short header: MSB=1, 15-bit length. */
        ptr[0] = 0x80 | ((len >> 8) & 0x7F);
        ptr[1] = len & 0xFF;
    } else {
        /* Long header: MSB=0, 63-bit length in big-endian. Widen to 64 bits
         * first: shifting a 32-bit size_t by 32..56 is undefined behavior. */
        uint64_t wide = (uint64_t)len;
        for (int i = 0; i < 8; i++) {
            ptr[i] = (uint8_t)(wide >> (56 - 8 * i));
        }
    }

    /* Skip the copy for empty strings so a NULL 's' is never passed to
     * memcpy(). */
    if (len > 0) memcpy(ptr + hdr_size, s, len);
    return ptr;
}

/* Free an arString previously returned by arStringNew() or arStringDup(). */
void arStringFree(void *ptr) {
    zfree(ptr);
}

/* Return a newly allocated byte-for-byte copy (header and payload) of the
 * arString at 'ptr'. */
void *arStringDup(void *ptr) {
    size_t len = arStringLen(ptr);
    size_t total = arStringHeaderSize(len) + len;
    void *copy = zmalloc(total);
    memcpy(copy, ptr, total);
    return copy;
}
*/ +void arFreePtr(void *v) { + if (arIsPtr(v)) { + arStringFree(v); + } +} + +/* ---------------------------------------------------------------------------- + * Slice allocation and management + * -------------------------------------------------------------------------- */ + +/* Create a new dense slice with given rel_idx (index relative to slice base) */ +arSlice *arSliceDenseNew(redisArray *ar, uint32_t rel_idx, uint32_t slice_size) { + uint32_t winsize = AR_SLICE_MIN_ALLOC; + uint32_t offset = rel_idx; + + /* Adjust offset if the initial window would extend past the slice + * boundary. For example, with slice size 4096 (the default), creating + * the slice around relative index 4093 needs the window shifted left. */ + if (offset + winsize > slice_size) { + offset = slice_size - winsize; + } + + arSlice *s = arAllocAndTrack(ar, arDenseAllocSize(winsize)); + s->encoding = AR_SLICE_DENSE; + s->count = 0; + s->layout.dense.offset = offset; + s->layout.dense.winsize = winsize; + s->layout.dense.max_idx = 0; + s->layout.dense.items = (void **)(s + 1); /* Payload starts after struct */ + memset(s->layout.dense.items, 0, winsize * sizeof(void *)); + return s; +} + +/* Sparse slices are a single allocation: the arSlice struct followed by + * a payload containing offsets[] and values[]. This function computes + * where these arrays live in the payload and sets the pointers accordingly. + * Must be called after zmalloc or memcpy, since copied slices have stale + * pointers that still reference the source allocation's memory. The values + * array requires pointer alignment, hence the padding after offsets[]. 
*/ +void arSparseSetupPointers(arSlice *s) { + char *p = (char *)(s + 1); + size_t offsets_size = s->layout.sparse.cap * sizeof(uint16_t); + size_t padding = (sizeof(void *) - (offsets_size % sizeof(void *))) % sizeof(void *); + s->layout.sparse.offsets = (uint16_t *)p; + s->layout.sparse.values = (void **)(p + offsets_size + padding); +} + +/* Create a new sparse slice */ +arSlice *arSliceSparseNew(redisArray *ar) { + uint32_t cap = (ArraySparseKMax < 4) ? ArraySparseKMax : 4; + arSlice *s = arAllocAndTrack(ar, arSparseAllocSize(cap)); + s->encoding = AR_SLICE_SPARSE; + s->count = 0; + s->layout.sparse.cap = cap; + arSparseSetupPointers(s); + return s; +} + +/* Free a slice (including all arString values inside). + * When ar is non-NULL, deducts the memory from ar->alloc_size. + * Pass NULL for ar when the entire array is being destroyed (arFree). */ +void arSliceFree(redisArray *ar, arSlice *s) { + if (!s) return; + + if (s->encoding == AR_SLICE_DENSE) { + for (uint32_t i = 0; i < s->layout.dense.winsize; i++) { + arTrackValueOut(ar, s->layout.dense.items[i]); + arFreePtr(s->layout.dense.items[i]); + } + } else { + void **values = s->layout.sparse.values; + for (uint32_t i = 0; i < s->count; i++) { + arTrackValueOut(ar, values[i]); + arFreePtr(values[i]); + } + } + arFreeAndTrack(ar, s); +} + +/* Grow dense slice to accommodate rel_idx (right growth) */ +arSlice *arSliceDenseGrowRight(redisArray *ar, arSlice *s, uint32_t rel_idx, uint32_t slice_size) { + uint32_t new_winsize = s->layout.dense.winsize; + + /* Double until rel_idx fits */ + while (rel_idx >= s->layout.dense.offset + new_winsize && new_winsize < slice_size) { + new_winsize <<= 1; + } + + uint32_t new_offset = s->layout.dense.offset; + if (new_winsize >= slice_size) { + new_winsize = slice_size; + new_offset = 0; + } else if (new_offset + new_winsize > slice_size) { + /* Window would exceed slice boundary, adjust offset */ + new_offset = slice_size - new_winsize; + } + + /* Fast path: when offset 
does not move, we can use realloc() to grow + * the dense allocation without relocating existing items ourselves. */ + if (new_offset == s->layout.dense.offset) { + uint32_t old_winsize = s->layout.dense.winsize; + arSlice *ns = arReallocAndTrack(ar, s, arDenseAllocSize(new_winsize)); + ns->layout.dense.winsize = new_winsize; + ns->layout.dense.items = (void **)(ns + 1); + + /* New tail must be explicitly zeroed for arIsEmpty() semantics. */ + memset(ns->layout.dense.items + old_winsize, 0, + (new_winsize - old_winsize) * sizeof(void *)); + return ns; + } + + /* Data copy path: offset moved, so we allocate a new slice and copy. */ + arSlice *ns = arAllocAndTrack(ar, arDenseAllocSize(new_winsize)); + ns->encoding = AR_SLICE_DENSE; + ns->count = s->count; + ns->layout.dense.offset = new_offset; + ns->layout.dense.winsize = new_winsize; + ns->layout.dense.max_idx = s->layout.dense.max_idx; + ns->layout.dense.items = (void **)(ns + 1); + + /* Zero-fill first to ensure arIsEmpty() works for new slots, then + * copy old data */ + memset(ns->layout.dense.items, 0, new_winsize * sizeof(void *)); + uint32_t shift = s->layout.dense.offset - new_offset; + serverAssert(shift + s->layout.dense.winsize <= new_winsize); + memcpy(ns->layout.dense.items + shift, s->layout.dense.items, s->layout.dense.winsize * sizeof(void *)); + + arFreeAndTrack(ar, s); + return ns; +} + +/* Grow dense slice to accommodate rel_idx (left growth with slack). + * Note that in this case no realloc() optimization is possible, still + * we can grow on the left more than needed (next power of two) so if + * there is a right-to-left access pattern we can cope. 
*/ +arSlice *arSliceDenseGrowLeft(redisArray *ar, arSlice *s, uint32_t rel_idx, uint32_t slice_size) { + uint32_t old_end = s->layout.dense.offset + s->layout.dense.winsize; + uint32_t need = old_end - rel_idx; + + /* Find next power of two that fits */ + uint32_t new_winsize = nearestNextPowerOf2(need); + if (new_winsize < AR_SLICE_MIN_ALLOC) new_winsize = AR_SLICE_MIN_ALLOC; + if (new_winsize > slice_size) new_winsize = slice_size; + + /* Position the window so that the old data is right-aligned (leaving + * slack on left) */ + int32_t new_offset = (int32_t)old_end - (int32_t)new_winsize; + if (new_offset < 0) new_offset = 0; + if (new_winsize == slice_size) new_offset = 0; + + arSlice *ns = arAllocAndTrack(ar, arDenseAllocSize(new_winsize)); + ns->encoding = AR_SLICE_DENSE; + ns->count = s->count; + ns->layout.dense.offset = (uint32_t)new_offset; + ns->layout.dense.winsize = new_winsize; + ns->layout.dense.max_idx = s->layout.dense.max_idx; + ns->layout.dense.items = (void **)(ns + 1); + + /* Zero-fill for arIsEmpty() semantics, then copy old data right-aligned */ + memset(ns->layout.dense.items, 0, new_winsize * sizeof(void *)); + uint32_t shift = s->layout.dense.offset - ns->layout.dense.offset; + serverAssert(shift + s->layout.dense.winsize <= new_winsize); + memcpy(ns->layout.dense.items + shift, s->layout.dense.items, s->layout.dense.winsize * sizeof(void *)); + + arFreeAndTrack(ar, s); + return ns; +} + +/* Grow dense slice if rel_idx is outside the current window. Returns a new + * slice, or the old pointer if the current slice can already accommodate the + * index. 
*/ +arSlice *arSliceDenseGrowIfNeeded(redisArray *ar, arSlice *s, uint32_t rel_idx, uint32_t slice_size) { + if (rel_idx >= s->layout.dense.offset + s->layout.dense.winsize) { + return arSliceDenseGrowRight(ar, s, rel_idx, slice_size); + } else if (rel_idx < s->layout.dense.offset) { + return arSliceDenseGrowLeft(ar, s, rel_idx, slice_size); + } + return s; +} + +/* Binary search in sparse slice. + * Returns index where rel_idx is or should be (the two cases + * can be distinguished via 'found'). */ +uint32_t arSparseFindPos(arSlice *s, uint16_t rel_idx, int *found) { + uint16_t *offsets = s->layout.sparse.offsets; + uint32_t lo = 0, hi = s->count; + while (lo < hi) { + uint32_t mid = lo + (hi - lo) / 2; + if (offsets[mid] < rel_idx) { + lo = mid + 1; + } else { + hi = mid; + } + } + *found = (lo < s->count && offsets[lo] == rel_idx); + return lo; +} + +/* Promote sparse slice to dense. */ +arSlice *arSparsePromote(redisArray *ar, arSlice *s, uint32_t slice_size) { + if (s->count == 0) { + arFreeAndTrack(ar, s); + return arSliceDenseNew(ar, 0, slice_size); + } + + uint16_t *offsets = s->layout.sparse.offsets; + void **values = s->layout.sparse.values; + + uint32_t min_off = offsets[0]; + uint32_t max_off = offsets[s->count - 1]; + uint32_t need = max_off - min_off + 1; + + uint32_t winsize = nearestNextPowerOf2(need); + if (winsize < AR_SLICE_MIN_ALLOC) winsize = AR_SLICE_MIN_ALLOC; + + uint32_t offset = min_off; + if (winsize >= slice_size) { + winsize = slice_size; + offset = 0; + } else if (offset + winsize > slice_size) { + /* Window would exceed slice boundary, adjust offset */ + offset = slice_size - winsize; + } + + arSlice *d = arAllocAndTrack(ar, arDenseAllocSize(winsize)); + d->encoding = AR_SLICE_DENSE; + d->count = s->count; + d->layout.dense.offset = offset; + d->layout.dense.winsize = winsize; + d->layout.dense.max_idx = max_off; + d->layout.dense.items = (void **)(d + 1); + + /* Set the entries in the sparse representation into the + * new dense 
slice. */ + memset(d->layout.dense.items, 0, winsize * sizeof(void *)); + for (uint32_t i = 0; i < s->count; i++) { + serverAssert(offsets[i] >= offset); + serverAssert(offsets[i] - offset < winsize); + d->layout.dense.items[offsets[i] - offset] = values[i]; + } + + arFreeAndTrack(ar, s); + return d; +} + +/* Demote the provided dense slice to a sparse slice, if beneficial. + * The function returns the dense slice given in input if not demoted, + * otherwise the newly created sparse slice containing the same elements + * is returned, in this case, as a side effect, the dense slice in + * input is freed. */ +arSlice *arDenseMaybeDemote(redisArray *ar, arSlice *d) { + if (ArraySparseKMax == 0) return d; // Sparse is disabled by config. + if (d->count > ArraySparseKMin) return d; // Yet not at demotion level. + if (d->count > ArraySparseKMax) return d; // Just config sanity check. + if (d->layout.dense.winsize == AR_SLICE_MIN_ALLOC) return d; // Already small. + + /* Only demote if it actually saves memory. We require the dense slice + * to be significantly larger than sparse would be (at least 25% bigger), + * and large enough in absolute terms (4x kmin) to be worth the trouble. */ + size_t dense_bytes = arDenseAllocSize(d->layout.dense.winsize); + size_t sparse_bytes = arSparseAllocSize(ArraySparseKMin); + if (d->layout.dense.winsize < 4 * ArraySparseKMin) return d; + if (dense_bytes < sparse_bytes * 5 / 4) return d; + + /* Demote it. */ + arSlice *s = arAllocAndTrack(ar, arSparseAllocSize(ArraySparseKMin)); + s->encoding = AR_SLICE_SPARSE; + s->count = 0; + s->layout.sparse.cap = ArraySparseKMin; + arSparseSetupPointers(s); + + /* Copy every entry from dense to sparse. 
*/ + uint16_t *offsets = s->layout.sparse.offsets; + void **values = s->layout.sparse.values; + for (uint32_t i = 0; i < d->layout.dense.winsize && s->count < d->count; i++) { + if (!arIsEmpty(d->layout.dense.items[i])) { + offsets[s->count] = d->layout.dense.offset + i; + values[s->count] = d->layout.dense.items[i]; + s->count++; + } + } + + arFreeAndTrack(ar, d); + return s; +} + +/* Update max_idx after deletion in dense slice. This is O(winsize) in the worst + * case, but we only scan when we deleted the current max, which is rare. */ +void arDenseUpdateMaxIdx(arSlice *d, uint32_t deleted_off) { + /* Note that if the slice is left without elements, it will get + * deallocated so there is nothing to set. */ + if (d->count == 0 || deleted_off < d->layout.dense.max_idx) return; + + /* Scan backward from old max to find new max. */ + for (int pos = d->layout.dense.max_idx - d->layout.dense.offset; pos >= 0; pos--) { + if (!arIsEmpty(d->layout.dense.items[pos])) { + d->layout.dense.max_idx = d->layout.dense.offset + pos; + return; + } + } +} + +/* ---------------------------------------------------------------------------- + * Directory management (flat mode and superdir mode) + * + * Why two modes: + * + * - Flat mode (ar->superdir == NULL): ar->dir is indexed by slice_id + * (ar->dir[slice_id] -> arSlice*). This is very fast and compact while + * slice IDs stay relatively low. + * + * - Superdir mode (ar->superdir != NULL): there are two levels of indirection. + * Metadata (that is, pointers to actual array slices) is split into sorted + * entries by block_id; each block is a fixed table of 2048 slice pointers. + * That table uses about 8 KB on 32-bit builds and 16 KB on 64-bit builds. + * Blocks are allocated on demand. Basically this means that what was, in + * flat mode, a contiguous array of slice pointers (called the directory), + * in superdir mode becomes a sparse array of directory pieces. 
+ * + * The superdir avoids catastrophic metadata growth for sparse/high indices. + * A flat directory must be sized up to the highest slice_id, even if almost + * all entries are NULL. With very large index jumps, that would waste huge + * memory. Superdir keeps metadata proportional to the number of populated + * blocks/slices instead of the largest slice_id ever seen. + * + * Promotion trigger: + * - When slice_id >= AR_SUPER_BLOCK_SLOTS (2048), flat mode is promoted. + * - Practical meaning: slice_id is idx / slice_size. + * With default slice_size=4096, threshold slice_id=2048 corresponds to + * idx >= 2048*4096 = 8,388,608 (first index that needs block_id 1). + * + * Hint: here what we gain is not just efficiency. Also there are no security + * concerns with setting a very high index. No problem with a corrupted RDB + * file containing a very high index, and no need to configure a maximum index + * allowable in an array. Thanks to this design the array type of Redis is + * a more useful and safe type. + * -------------------------------------------------------------------------- */ + +/* Binary search for block_id in superdir. + * Returns index where found or should be inserted. */ +uint32_t arSuperDirFind(redisArray *ar, uint64_t block_id, int *found) { + uint32_t lo = 0, hi = ar->sdir_len; + while (lo < hi) { + uint32_t mid = lo + (hi - lo) / 2; + if (ar->superdir[mid].block_id < block_id) { + lo = mid + 1; + } else { + hi = mid; + } + } + *found = (lo < ar->sdir_len && ar->superdir[lo].block_id == block_id); + return lo; +} + +/* Get slice pointer from superdir mode. Returns NULL if not found. 
*/ +arSlice **arSuperDirGetSlot(redisArray *ar, uint64_t slice_id) { + uint64_t block_id = slice_id / AR_SUPER_BLOCK_SLOTS; + uint32_t block_off = slice_id % AR_SUPER_BLOCK_SLOTS; + + int found; + uint32_t pos = arSuperDirFind(ar, block_id, &found); + if (!found) return NULL; + + return ar->superdir[pos].slots + block_off; +} + +/* Ensure block exists in superdir, creating if needed. Returns slot pointer. */ +arSlice **arSuperDirEnsureSlot(redisArray *ar, uint64_t slice_id) { + uint64_t block_id = slice_id / AR_SUPER_BLOCK_SLOTS; + uint32_t block_off = slice_id % AR_SUPER_BLOCK_SLOTS; + + int found; + uint32_t pos = arSuperDirFind(ar, block_id, &found); + + if (!found) { + /* Need to insert new block */ + if (ar->sdir_len >= ar->sdir_cap) { + /* Grow superdir array */ + uint32_t new_cap = ar->sdir_cap ? ar->sdir_cap * 2 : 4; + ar->superdir = arReallocAndTrack(ar, ar->superdir, new_cap * sizeof(arSDirEntry)); + ar->sdir_cap = new_cap; + } + + /* Shift entries to make room */ + if (pos < ar->sdir_len) { + memmove(ar->superdir + pos + 1, ar->superdir + pos, + (ar->sdir_len - pos) * sizeof(arSDirEntry)); + } + + /* Initialize new entry */ + ar->superdir[pos].block_id = block_id; + ar->superdir[pos].count = 0; + ar->superdir[pos].slots = arCallocAndTrack(ar, AR_SUPER_BLOCK_SLOTS * sizeof(arSlice *)); + ar->sdir_len++; + } + + return ar->superdir[pos].slots + block_off; +} + +/* Look up the superdir block that contains slice_id. + * Returns a pointer to that arSDirEntry, or NULL if the block was never + * allocated (no slices currently exist in that block). */ +arSDirEntry *arSuperDirGetEntry(redisArray *ar, uint64_t slice_id) { + uint64_t block_id = slice_id / AR_SUPER_BLOCK_SLOTS; + int found; + uint32_t pos = arSuperDirFind(ar, block_id, &found); + return found ? ar->superdir + pos : NULL; +} + +/* Remove one block entry from superdir at index pos. + * We assume 'pos' is valid and the block is logically empty (count == 0). 
+ * Frees the slice-pointer table, compacts remaining entries (keeping order by + * block_id), and decrements ar->sdir_len. */ +void arSuperDirRemoveBlock(redisArray *ar, uint32_t pos) { + arFreeAndTrack(ar, ar->superdir[pos].slots); + if (pos < ar->sdir_len - 1) { + memmove(ar->superdir + pos, ar->superdir + pos + 1, + (ar->sdir_len - pos - 1) * sizeof(arSDirEntry)); + } + ar->sdir_len--; +} + +/* Promote from flat directory to superdir mode. Flat mode only ever uses + * slice_id < AR_SUPER_BLOCK_SLOTS, so all existing slices belong to block 0. */ +void arPromoteToSuperDir(redisArray *ar) { + ar->sdir_cap = 4; + ar->sdir_len = 0; + ar->superdir = arAllocAndTrack(ar, ar->sdir_cap * sizeof(arSDirEntry)); + + /* Copy existing flat dir content into block 0 */ + if (ar->dir_alloc > 0) { + ar->superdir[0].block_id = 0; + ar->superdir[0].slots = arCallocAndTrack(ar, AR_SUPER_BLOCK_SLOTS * sizeof(arSlice *)); + ar->superdir[0].count = 0; + ar->sdir_len = 1; + + /* Copy flat dir pointers to block 0, counting non-NULL */ + for (uint64_t i = 0; i < ar->dir_alloc; i++) { + ar->superdir[0].slots[i] = ar->dir[i]; + if (ar->dir[i]) ar->superdir[0].count++; + } + } + + /* Free old flat directory */ + if (ar->dir) arFreeAndTrack(ar, ar->dir); + ar->dir = NULL; + ar->dir_alloc = 0; +} + +/* Grow directory to accommodate slice_id (handles both modes, dense and + * superdir mode). */ +void arDirGrow(redisArray *ar, uint64_t slice_id) { + /* Check if promotion to superdir is needed */ + if (ar->superdir == NULL && slice_id >= AR_SUPER_BLOCK_SLOTS) { + arPromoteToSuperDir(ar); + } + + if (ar->superdir) { + /* Superdir allocates blocks on-demand in arSetSlice(), so we don't + * allocate a 2048-pointer block for ranges that end up empty. */ + return; + } + + /* Flat mode: grow directory if needed */ + if (slice_id < ar->dir_alloc) return; + + uint64_t new_alloc = ar->dir_alloc ? 
ar->dir_alloc : 1; + + /* Grow geometrically and stop at the first power-of-two size + * that can index slice_id. Note that thanks to superdir mode the + * size of this table of pointers is bound. */ + while (new_alloc <= slice_id) { + new_alloc <<= 1; + } + + arSlice **new_dir = arReallocAndTrack(ar, ar->dir, new_alloc * sizeof(arSlice *)); + + /* Zero-fill new slots */ + memset(new_dir + ar->dir_alloc, 0, (new_alloc - ar->dir_alloc) * sizeof(arSlice *)); + ar->dir = new_dir; + ar->dir_alloc = new_alloc; +} + +/* Maybe shrink directory after freeing a slice (flat mode only). + * Since dir_alloc is always a power of two, we can only shrink by halving. + * So shrinking only happens when dir_highest_used < dir_alloc/2. The 90% + * check is just a quick early-out to skip the loop in the common case. */ +void arDirMaybeShrink(redisArray *ar) { + if (ar->superdir) return; /* Superdir mode: blocks freed individually */ + if (ar->count == 0) return; /* Will be deleted anyway */ + if (ar->dir_highest_used >= ar->dir_alloc * 9 / 10) return; + + /* Find smallest power of two > dir_highest_used */ + uint64_t new_alloc = 1; + while (new_alloc <= ar->dir_highest_used) new_alloc <<= 1; + if (new_alloc >= ar->dir_alloc) return; + + ar->dir = arReallocAndTrack(ar, ar->dir, new_alloc * sizeof(arSlice *)); + ar->dir_alloc = new_alloc; +} + +/* Update dir_highest_used after freeing a slice. + * To always know the highest directory index used is useful + * for a number of reasons: + * 1. arLen() is O(1) this way. + * 2. We can start reverse scans from the rightmost populated directory entry. + * 3. We can shrink the directory (in flat mode) if needed, since we know + * the usage. 
*/ +void arDirUpdateHighest(redisArray *ar, uint64_t freed_id) { + if (ar->count == 0) return; + if (freed_id < ar->dir_highest_used) return; + + if (ar->superdir) { + /* Superdir mode: scan backwards through blocks */ + for (int32_t bi = ar->sdir_len - 1; bi >= 0; bi--) { + arSDirEntry *e = ar->superdir + bi; + if (e->count == 0) continue; + /* Scan backwards through this block's slots */ + for (int32_t si = AR_SUPER_BLOCK_SLOTS - 1; si >= 0; si--) { + if (e->slots[si] != NULL) { + ar->dir_highest_used = e->block_id * AR_SUPER_BLOCK_SLOTS + si; + return; + } + } + } + ar->dir_highest_used = 0; + } else { + /* Flat mode: scan backward for next non-NULL slice */ + for (int64_t i = (int64_t)freed_id - 1; i >= 0; i--) { + if (ar->dir[i] != NULL) { + ar->dir_highest_used = i; + return; + } + } + ar->dir_highest_used = 0; + } +} + +/* Get slice pointer by slice_id (which is the logical array-index divided by + * the elements-per-slice), handling both flat and superdir modes. If no slice + * was already allocated for such slice_id, NULL is returned. */ +arSlice *arGetSlice(redisArray *ar, uint64_t slice_id) { + if (ar->superdir) { + arSlice **slot = arSuperDirGetSlot(ar, slice_id); + return slot ? *slot : NULL; + } else { + if (slice_id >= ar->dir_alloc) return NULL; + return ar->dir[slice_id]; + } +} + +/* Set slice pointer in the directory. In superdir mode, setting to NULL + * decrements the block's slice count and frees the block if it becomes empty. + * Setting to non-NULL allocates the block if needed. 
*/ +void arSetSlice(redisArray *ar, uint64_t slice_id, arSlice *s) { + if (ar->superdir) { + uint64_t block_id = slice_id / AR_SUPER_BLOCK_SLOTS; + uint32_t block_off = slice_id % AR_SUPER_BLOCK_SLOTS; + + int found; + uint32_t pos = arSuperDirFind(ar, block_id, &found); + + if (s == NULL) { + /* Setting to NULL: decrement block count, maybe remove block */ + if (!found) return; /* Block doesn't exist, nothing to do */ + arSDirEntry *entry = ar->superdir + pos; + if (entry->slots[block_off] != NULL) { + entry->slots[block_off] = NULL; + entry->count--; + ar->num_slices--; + /* Remove empty block */ + if (entry->count == 0) { + arSuperDirRemoveBlock(ar, pos); + } + } + } else { + /* Setting to non-NULL: ensure block exists */ + arSlice **slot = arSuperDirEnsureSlot(ar, slice_id); + arSDirEntry *entry = arSuperDirGetEntry(ar, slice_id); + if (*slot == NULL) { + entry->count++; + ar->num_slices++; + } + *slot = s; + } + } else { + if (s == NULL && ar->dir[slice_id] != NULL) ar->num_slices--; + else if (s != NULL && ar->dir[slice_id] == NULL) ar->num_slices++; + ar->dir[slice_id] = s; + } +} + +/* ---------------------------------------------------------------------------- + * Value encoding + * -------------------------------------------------------------------------- */ + +/* Try to encode string as immediate integer */ +int arTryEncodeInt(const char *s, size_t len, void **out) { + long long ll; + if (string2ll(s, len, &ll) && arIntFits(ll)) { + *out = arFromInt(ll); + return 1; + } + return 0; +} + +/* Try to encode string as immediate float. + * + * The local immediate float encoding clears the low 2 bits of the underlying + * floating-point payload to make room for the tag. On 64-bit builds we do it + * on the IEEE-754 double bits directly. On 32-bit builds we first quantize to + * float, then clear the low 2 bits of the float payload. We only encode if the + * later string representation matches the original input exactly. 
+ * + * There's a subtlety with whole-number floats: d2string formats 1.0 as "1" + * (without decimal point), so "1.0" wouldn't match and would be stored as + * a heap string. We fix this by appending ".0" when d2string produces an + * integer-looking result and comparing again. + * + * Note: pure integers like "1" are handled by arTryEncodeInt first, so values + * reaching here that look like integers after d2string likely had ".0". */ +int arTryEncodeFloat(const char *s, size_t len, void **out) { + /* Fast filter to discard things that obviously can't pass the later + * round-trip test: + * + * 1. Can have optional leading '-'. + * 2. Can be composed only by digits plus one mandatory '.'. + * + * This skips expensive float parsing for obvious non-candidates. */ + size_t i = 0; + int dot_seen = 0; + + if (len == 0) return 0; + if (s[0] == '-') { + if (len == 1) return 0; + i = 1; + } + for (; i < len; i++) { + char c = s[i]; + if (c == '.') { + if (dot_seen) return 0; + dot_seen = 1; + } else if (c < '0' || c > '9') { + return 0; + } + } + if (!dot_seen) return 0; + + /* Expensive round-trip path: convert to double. */ + double d; + if (!string2d(s, len, &d)) return 0; + if (isnan(d) || isinf(d)) return 0; + + uint64_t bits_trunc; + double d_trunc; +#if UINTPTR_MAX == UINT64_MAX + /* Truncate the double payload directly on 64-bit builds. */ + uint64_t bits; + memcpy(&bits, &d, sizeof(bits)); + bits_trunc = bits & ~AR_TAG_MASK; + memcpy(&d_trunc, &bits_trunc, sizeof(d_trunc)); +#else + /* 32-bit builds inline floats, not doubles. Quantize first, then clear + * the low 2 bits of the float payload. */ + float f = (float)d; + if (!isfinite(f)) return 0; // May happen after casting. 
+ uint32_t bits32; + uint32_t bits32_trunc; + float f_trunc; + + memcpy(&bits32, &f, sizeof(bits32)); + bits32_trunc = bits32 & ~(uint32_t)AR_TAG_MASK; + memcpy(&f_trunc, &bits32_trunc, sizeof(f_trunc)); + bits_trunc = bits32_trunc; + d_trunc = (double)f_trunc; // Reduced precision float here. +#endif + + /* Verify round-trip */ + char buf[AR_INLINE_BUFSIZE]; + int buflen = d2string(buf, sizeof(buf) - 2, d_trunc); + if ((size_t)buflen == len && memcmp(buf, s, len) == 0) { + *out = arFromFloatBits(bits_trunc); + return 1; + } + + /* Also try the ".0" form. d2string(1.0) returns "1", but when floats are + * later converted back to strings we restore ".0" for integer-looking + * values, so inputs like "1.0" can still round-trip exactly. */ + buf[buflen] = '.'; + buf[buflen + 1] = '0'; + buf[buflen + 2] = '\0'; + buflen += 2; + if ((size_t)buflen == len && memcmp(buf, s, len) == 0) { + *out = arFromFloatBits(bits_trunc); + return 1; + } + + return 0; +} + +/* Format a float in the canonical string form exposed by arrays. + * buf must be at least AR_INLINE_BUFSIZE bytes. We use d2string() for the + * shortest round-trippable representation, then restore ".0" for + * integer-looking finite values so decoded/replied floats match the logical + * form expected by array persistence and encoding checks. 
*/ +int arFormatFloat(double d, char *buf, size_t bufsize) { + serverAssert(bufsize >= AR_INLINE_BUFSIZE); + int len = d2string(buf, bufsize - 2, d); + if (isfinite(d) && !memchr(buf, '.', len) && !memchr(buf, 'e', len) && + !memchr(buf, 'E', len)) { + serverAssert((size_t)len + 2 < bufsize); + buf[len++] = '.'; + buf[len++] = '0'; + buf[len] = '\0'; + } + return len; +} + +/* Encode a string into a tagged value */ +void *arEncode(const char *s, size_t len) { + void *v; + + /* Try integer first */ + if (arTryEncodeInt(s, len, &v)) { + return v; + } + + /* Try float */ + if (arTryEncodeFloat(s, len, &v)) { + return v; + } + + /* Try small string (architecture-dependent inline limit). */ + if (len <= AR_SMALLSTR_MAXLEN) { + return arFromSmallStr(s, (int)len); + } + + /* Fall back to arString (8+ bytes) */ + return arStringNew(s, len); +} + +void *arValueFromRdbInt(int64_t ival) { + if (arIntFits(ival)) return arFromInt(ival); + + /* If the integer does not fit (i.e. loading into a 32 bit instance + * what was stored in the RDB by a 64 bit instance), we promote it + * to a plain string. */ + char buf[32]; + int len = ll2string(buf, sizeof(buf), ival); + return arStringNew(buf, len); +} + +void *arValueFromRdbFloat(double d) { +#if UINTPTR_MAX == UINT64_MAX + /* On 64-bit, doubles are inlined directly (low 2 bits cleared). + * No string round-trip needed: the RDB double already has clean + * low bits (from the saving side's arToDouble). */ + uint64_t bits; + memcpy(&bits, &d, sizeof(bits)); + return arFromFloatBits(bits); +#endif + + /* Loading on a 32 bit system is more complicated to do efficiently. + * + * RDB always stores array floats as doubles. On 32-bit systems we can + * only inline a float payload with the low 2 bits stolen for the tag. + * Simulate that exact quantization path and keep the value encoded only + * if it survives unchanged. 
*/ + uint32_t bits32; + uint32_t bits32_trunc; + float f_trunc; + double d_trunc; + + /* Narrow to float first, then clear the low 2 payload bits that are + * reserved for the tagged-pointer type. */ + float f = (float)d; + memcpy(&bits32, &f, sizeof(bits32)); + bits32_trunc = bits32 & ~(uint32_t)AR_TAG_MASK; + memcpy(&f_trunc, &bits32_trunc, sizeof(f_trunc)); + d_trunc = (double)f_trunc; + + /* Bitwise comparison keeps signed zero distinct and tells us whether + * the 64-bit RDB value is exactly representable by the local 30-bit + * inline-float format. */ + uint64_t bits64; + uint64_t bits64_trunc; + memcpy(&bits64, &d, sizeof(bits64)); + memcpy(&bits64_trunc, &d_trunc, sizeof(bits64_trunc)); + if (bits64 == bits64_trunc) return arFromFloatBits(bits32_trunc); + + /* Otherwise materialize the canonical string form for this float. */ + char buf[AR_INLINE_BUFSIZE]; + int len = arFormatFloat(d, buf, sizeof(buf)); + return arStringNew(buf, len); +} + +void *arValueFromRdbSmallStr(const char *s, size_t len) { + if (len <= AR_SMALLSTR_MAXLEN) return arFromSmallStr(s, (int)len); + return arStringNew(s, len); +} + +/* Decode a tagged value into raw bytes. + * For inline values, buf must point to at least AR_INLINE_BUFSIZE bytes and + * the returned pointer will be buf. For arString values, the returned pointer + * aliases the string payload directly. Returns NULL if value is empty. + * + * This is a helper function used for AOF rewriting, AROP string "MATCH" + * and DEBUG DIGEST. 
*/ +const char *arDecode(void *v, char *buf, size_t bufsize, size_t *outlen) { + serverAssert(bufsize >= AR_INLINE_BUFSIZE); + if (arIsEmpty(v)) { + if (outlen) *outlen = 0; + return NULL; + } + + if (arIsInt(v)) { + int64_t ival = arToInt(v); + int len = ll2string(buf, 32, ival); + if (outlen) *outlen = len; + return buf; + } + + if (arIsFloat(v)) { + double d = arToDouble(v); + int len = arFormatFloat(d, buf, bufsize); + if (outlen) *outlen = len; + return buf; + } + + if (arIsSmallStr(v)) { + int len = arSmallStrLen(v); + if (outlen) *outlen = len; + arToSmallStr(v, buf); + return buf; + } + + /* arString pointer */ + size_t len = arStringLen(v); + if (outlen) *outlen = len; + return arStringData(v); +} + + +/* ---------------------------------------------------------------------------- + * Array lifecycle + * -------------------------------------------------------------------------- */ + +/* Create a new empty array */ +redisArray *arNew(void) { + redisArray *ar = zmalloc(sizeof(redisArray)); + ar->count = 0; + ar->insert_idx = AR_INSERT_IDX_NONE; + ar->dir_alloc = 0; + ar->dir_highest_used = 0; + ar->num_slices = 0; + ar->alloc_size = zmalloc_size(ar); + ar->slice_size = ArraySliceSize; /* Use current config value */ + ar->sdir_len = 0; + ar->sdir_cap = 0; + ar->dir = NULL; + ar->superdir = NULL; + return ar; +} + +/* Free an array and all its contents */ +void arFree(redisArray *ar) { + if (!ar) return; + + if (ar->superdir) { + /* Superdir mode: free all blocks and their slices */ + for (uint32_t i = 0; i < ar->sdir_len; i++) { + arSDirEntry *e = ar->superdir + i; + for (uint32_t j = 0; j < AR_SUPER_BLOCK_SLOTS; j++) { + if (e->slots[j]) arSliceFree(NULL, e->slots[j]); + } + zfree(e->slots); + } + zfree(ar->superdir); + } else { + /* Flat mode */ + for (uint64_t i = 0; i < ar->dir_alloc; i++) { + if (ar->dir[i]) { + arSliceFree(NULL, ar->dir[i]); + } + } + zfree(ar->dir); + } + zfree(ar); +} + +/* Dismiss a single slice's memory back to the OS. 
*/ +static void arSliceDismiss(arSlice *s, int dismiss_values) { + if (s->encoding == AR_SLICE_DENSE) { + if (dismiss_values) { + void **items = s->layout.dense.items; + for (uint32_t i = 0; i < s->layout.dense.winsize; i++) { + if (arIsPtr(items[i])) + dismissMemory(items[i], arStringLen(items[i])); + } + } + dismissMemory(s, arDenseAllocSize(s->layout.dense.winsize)); + } else { + if (dismiss_values) { + void **values = s->layout.sparse.values; + for (uint32_t i = 0; i < s->count; i++) { + if (arIsPtr(values[i])) + dismissMemory(values[i], arStringLen(values[i])); + } + } + dismissMemory(s, arSparseAllocSize(s->layout.sparse.cap)); + } +} + +/* See dismissObject(). Always dismiss the directory and slices; per-value + * dismissal only when the average element size makes it worthwhile. */ +void arDismiss(redisArray *ar, size_t size_hint) { + if (!ar) return; + uint64_t count = ar->count; + int dismiss_values = (count != 0 && size_hint / count >= server.page_size); + + if (ar->superdir) { + for (uint32_t bi = 0; bi < ar->sdir_len; bi++) { + arSDirEntry *e = ar->superdir + bi; + for (uint32_t si = 0; si < AR_SUPER_BLOCK_SLOTS; si++) { + if (e->slots[si] == NULL) continue; + arSliceDismiss(e->slots[si], dismiss_values); + } + dismissMemory(e->slots, AR_SUPER_BLOCK_SLOTS * sizeof(arSlice *)); + } + dismissMemory(ar->superdir, ar->sdir_cap * sizeof(arSDirEntry)); + } else if (ar->dir) { + for (uint64_t i = 0; i < ar->dir_alloc; i++) { + if (ar->dir[i] == NULL) continue; + arSliceDismiss(ar->dir[i], dismiss_values); + } + dismissMemory(ar->dir, ar->dir_alloc * sizeof(arSlice *)); + } +} + +/* arDup() helper to duplicate a single slice into the duplicated array. + * This function is responsible of tracking allocations in dup_ar + * (hence the name of the parameter), since it has the knowledge of + * the array slice that it is duplicating. 
+ * + * The dear reader of this code may wonder why we don't just duplicate the + * array and its slices without tracking memory, and then copy the memory + * field of the array at the end. The problem is that the array does not + * track the logical allocated memory, but the actual memory usage reported + * by the allocator: there is no guarantee that the allocations of the copy + * perfectly match the ones of the original array. */ +arSlice *arSliceDup(redisArray *dup_ar, arSlice *s) { + if (s->encoding == AR_SLICE_DENSE) { + size_t sz = arDenseAllocSize(s->layout.dense.winsize); + arSlice *nd = arAllocAndTrack(dup_ar, sz); + memcpy(nd, s, sizeof(arSlice)); + nd->layout.dense.items = (void **)(nd + 1); + memcpy(nd->layout.dense.items, s->layout.dense.items, + s->layout.dense.winsize * sizeof(void *)); + + /* Duplicate arString pointers */ + for (uint32_t j = 0; j < s->layout.dense.winsize; j++) { + if (arIsPtr(nd->layout.dense.items[j])) { + nd->layout.dense.items[j] = arStringDup(nd->layout.dense.items[j]); + arTrackValueIn(dup_ar, nd->layout.dense.items[j]); + } + } + return nd; + } else { + size_t sz = arSparseAllocSize(s->layout.sparse.cap); + arSlice *nsp = arAllocAndTrack(dup_ar, sz); + memcpy(nsp, s, sizeof(arSlice)); + arSparseSetupPointers(nsp); + memcpy(nsp->layout.sparse.offsets, s->layout.sparse.offsets, + s->layout.sparse.cap * sizeof(uint16_t)); + memcpy(nsp->layout.sparse.values, s->layout.sparse.values, + s->layout.sparse.cap * sizeof(void *)); + + /* Duplicate arString pointers */ + void **values = nsp->layout.sparse.values; + for (uint32_t j = 0; j < s->count; j++) { + if (arIsPtr(values[j])) { + values[j] = arStringDup(values[j]); + arTrackValueIn(dup_ar, values[j]); + } + } + return nsp; + } +} + +/* Duplicate an array (deep copy) */ +redisArray *arDup(redisArray *ar) { + redisArray *dup = zmalloc(sizeof(redisArray)); + dup->count = ar->count; + dup->insert_idx = ar->insert_idx; + dup->dir_alloc = ar->dir_alloc; + dup->dir_highest_used = 
ar->dir_highest_used; + dup->num_slices = ar->num_slices; + dup->alloc_size = zmalloc_size(dup); + dup->slice_size = ar->slice_size; + dup->sdir_len = ar->sdir_len; + dup->sdir_cap = ar->sdir_cap; + + if (ar->superdir) { + /* Superdir mode */ + dup->dir = NULL; + dup->superdir = arAllocAndTrack(dup, ar->sdir_cap * sizeof(arSDirEntry)); + + for (uint32_t i = 0; i < ar->sdir_len; i++) { + arSDirEntry *src = ar->superdir + i; + arSDirEntry *dst = dup->superdir + i; + + dst->block_id = src->block_id; + dst->count = src->count; + dst->slots = arCallocAndTrack(dup, AR_SUPER_BLOCK_SLOTS * sizeof(arSlice *)); + + for (uint32_t j = 0; j < AR_SUPER_BLOCK_SLOTS; j++) { + if (src->slots[j]) { + dst->slots[j] = arSliceDup(dup, src->slots[j]); + } + } + } + } else if (ar->dir_alloc > 0) { + /* Flat mode */ + dup->superdir = NULL; + dup->dir = arAllocAndTrack(dup, ar->dir_alloc * sizeof(arSlice *)); + memset(dup->dir, 0, ar->dir_alloc * sizeof(arSlice *)); + + for (uint64_t i = 0; i < ar->dir_alloc; i++) { + if (ar->dir[i]) { + dup->dir[i] = arSliceDup(dup, ar->dir[i]); + } + } + } else { + dup->dir = NULL; + dup->superdir = NULL; + } + + return dup; +} + +/* ---------------------------------------------------------------------------- + * Core operations + * -------------------------------------------------------------------------- */ + +/* Get value at index (returns NULL for empty/missing) */ +void *arGet(redisArray *ar, uint64_t idx) { + uint64_t slice_id = arSliceId(idx, ar->slice_size); + uint32_t rel_idx = arSliceOff(idx, ar->slice_size); + + arSlice *s = arGetSlice(ar, slice_id); + if (s == NULL) return NULL; // No slice at all for this index. + + if (s->encoding == AR_SLICE_DENSE) { + if (rel_idx < s->layout.dense.offset || + rel_idx >= s->layout.dense.offset + s->layout.dense.winsize) + { + // The slice window does not include this index. 
+ return NULL; + } + return s->layout.dense.items[rel_idx - s->layout.dense.offset]; + } else { + int found; + uint32_t pos = arSparseFindPos(s, (uint16_t)rel_idx, &found); + if (found) { + void **values = s->layout.sparse.values; + return values[pos]; + } + return NULL; + } +} + +/* Set value at index. Caller must ensure idx != UINT64_MAX. + * v must not be NULL (empty) - use arDel() to delete elements. */ +void arSet(redisArray *ar, uint64_t idx, void *v) { + serverAssert(v != NULL); /* Use arDel for deletion, not arSet(v=NULL) */ + /* UINT64_MAX can't be used for a couple of reasons: for once, + * the array len is the max index set + 1, so we could not represent + * that; also it is a sentinel for last set index still not being set. */ + serverAssert(idx != UINT64_MAX); + uint64_t slice_id = arSliceId(idx, ar->slice_size); + uint32_t rel_idx = arSliceOff(idx, ar->slice_size); + + /* Ensure directory capacity (may trigger promotion to superdir) */ + arDirGrow(ar, slice_id); + + /* Get current slice */ + arSlice *s = arGetSlice(ar, slice_id); + + /* Create slice if missing */ + if (s == NULL) { + if (ArraySparseKMax > 0) { + s = arSliceSparseNew(ar); + } else { + s = arSliceDenseNew(ar, rel_idx, ar->slice_size); + } + arSetSlice(ar, slice_id, s); + } + + if (s->encoding == AR_SLICE_DENSE) { + /* Grow the slice window if needed */ + s = arSliceDenseGrowIfNeeded(ar, s, rel_idx, ar->slice_size); + arSetSlice(ar, slice_id, s); // In case it changed. + + uint32_t pos = rel_idx - s->layout.dense.offset; + void *old = s->layout.dense.items[pos]; + + if (arIsEmpty(old)) { + s->count++; + ar->count++; + } else { + /* Replace existing value. 
*/ + arTrackValueOut(ar, old); + arFreePtr(old); + } + + arTrackValueIn(ar, v); + s->layout.dense.items[pos] = v; + + /* Update max_idx */ + if (rel_idx > s->layout.dense.max_idx) { + s->layout.dense.max_idx = rel_idx; + } + } else { + int found; + uint32_t pos = arSparseFindPos(s, (uint16_t)rel_idx, &found); + uint16_t *offsets = s->layout.sparse.offsets; + void **values = s->layout.sparse.values; + + if (found) { + /* Replace existing */ + arTrackValueOut(ar, values[pos]); + arFreePtr(values[pos]); + arTrackValueIn(ar, v); + values[pos] = v; + } else { + /* Insert new */ + if (s->count >= ArraySparseKMax) { + /* Promote to dense */ + arSlice *d = arSparsePromote(ar, s, ar->slice_size); + + /* Grow window if needed */ + d = arSliceDenseGrowIfNeeded(ar, d, rel_idx, ar->slice_size); + arSetSlice(ar, slice_id, d); + + uint32_t dpos = rel_idx - d->layout.dense.offset; + arTrackValueIn(ar, v); + d->layout.dense.items[dpos] = v; + d->count++; + ar->count++; + if (rel_idx > d->layout.dense.max_idx) d->layout.dense.max_idx = rel_idx; + } else { + /* Insert in sparse */ + if (s->count >= s->layout.sparse.cap) { + /* Grow capacity, we grow 2x but note that there is no + * point in growing more than kmax, so we clamp to kmax. 
*/ + uint32_t new_cap = s->layout.sparse.cap * 2; + if (new_cap > ArraySparseKMax) new_cap = ArraySparseKMax; + arSlice *ns = arAllocAndTrack(ar, arSparseAllocSize(new_cap)); + ns->encoding = AR_SLICE_SPARSE; + ns->count = s->count; + ns->layout.sparse.cap = new_cap; + arSparseSetupPointers(ns); + + /* Copy old data to new slice */ + uint16_t *old_offsets = s->layout.sparse.offsets; + void **old_values = s->layout.sparse.values; + uint16_t *new_offsets = ns->layout.sparse.offsets; + void **new_values = ns->layout.sparse.values; + memcpy(new_offsets,old_offsets,s->count * sizeof(uint16_t)); + memcpy(new_values,old_values,s->count * sizeof(void *)); + + arFreeAndTrack(ar, s); + s = ns; + arSetSlice(ar, slice_id, s); + offsets = new_offsets; + values = new_values; + } + + /* Shift and insert in place */ + memmove(offsets + pos + 1, offsets + pos, + (s->count - pos) * sizeof(uint16_t)); + memmove(values + pos + 1, values + pos, + (s->count - pos) * sizeof(void *)); + offsets[pos] = (uint16_t)rel_idx; + arTrackValueIn(ar, v); + values[pos] = v; + s->count++; + ar->count++; + } + } + } + + /* Update dir_highest_used. The count==1 check handles when we just added + * the first element to an empty array. */ + if (slice_id > ar->dir_highest_used || ar->count == 1) { + ar->dir_highest_used = slice_id; + } +} + +/* Delete value at index. Returns 1 if deleted, 0 if was already empty. 
*/
int arDel(redisArray *ar, uint64_t idx) {
    uint64_t slice_id = arSliceId(idx, ar->slice_size);
    uint32_t rel_idx = arSliceOff(idx, ar->slice_size);

    arSlice *s = arGetSlice(ar, slice_id);
    if (s == NULL) return 0;

    if (s->encoding != AR_SLICE_DENSE) {
        /* Sparse slice: locate the element, release it and close the gap
         * in both the offsets and values arrays. */
        int found;
        uint32_t pos = arSparseFindPos(s, (uint16_t)rel_idx, &found);
        if (!found) return 0;

        uint16_t *offsets = s->layout.sparse.offsets;
        void **values = s->layout.sparse.values;
        uint32_t tail = s->count - pos - 1;

        arTrackValueOut(ar, values[pos]);
        arFreePtr(values[pos]);
        memmove(offsets + pos, offsets + pos + 1, tail * sizeof(uint16_t));
        memmove(values + pos, values + pos + 1, tail * sizeof(void *));
        s->count--;
        ar->count--;
    } else {
        /* Dense slice: indexes outside the window are empty by definition. */
        if (rel_idx < s->layout.dense.offset ||
            rel_idx >= s->layout.dense.offset + s->layout.dense.winsize)
        {
            return 0;
        }

        void **cell = &s->layout.dense.items[rel_idx - s->layout.dense.offset];
        void *prev = *cell;
        if (arIsEmpty(prev)) return 0;

        arTrackValueOut(ar, prev);
        arFreePtr(prev);
        *cell = NULL;
        s->count--;
        ar->count--;

        /* max_idx may need to be updated if the max was deleted. */
        arDenseUpdateMaxIdx(s, rel_idx);

        /* A still populated dense slice may be demoted to sparse, and we
         * are done. An empty one is released below. */
        if (s->count != 0) {
            arSetSlice(ar, slice_id, arDenseMaybeDemote(ar, s));
            return 1;
        }
    }

    /* Release the slice if it is now empty. */
    if (s->count == 0) {
        arSliceFree(ar, s);
        /* In superdir mode arSetSlice() also frees empty blocks. */
        arSetSlice(ar, slice_id, NULL);
        arDirUpdateHighest(ar, slice_id);
        arDirMaybeShrink(ar);
    }
    return 1;
}

/* ============================================================================
 * GENERALIZED RANGE DELETE - arDeleteRange
 * ============================================================================
 *
 * This function provides O(N) range deletion where N is the number of stored
 * elements, NOT the numeric range length. It achieves this by:
 *
 * 1.
Deleting whole fully-covered slices in the middle range. + * 2. In superdir mode, visiting only overlapping blocks and covered slice + * slots within them, instead of scanning the numeric slice-id span. + * 3. Only doing element-level deletion inside the two boundary slices. + * + * This is used by ARDELRANGE directly and by arTruncate as a special case. + * -------------------------------------------------------------------------- */ + +/* Helper: delete elements within a single slice in offset range + * [del_lo..del_hi]. Returns number of elements deleted. Handles both dense + * and sparse slices. + * + * Dense slices delete slot-by-slot inside the window. Sparse slices identify + * the contiguous offset span to delete, free those values, then compact the + * tail once. + * + * If the slice becomes empty, it is freed and the slot is cleared. */ +uint64_t arDeleteSliceRange(redisArray *ar, uint64_t slice_id, + uint32_t del_lo, uint32_t del_hi) { + arSlice *s = arGetSlice(ar, slice_id); + if (!s) return 0; + + uint64_t deleted = 0; + + if (s->encoding == AR_SLICE_DENSE) { + /* Dense: intersect deletion range with allocated window */ + uint32_t win_lo = s->layout.dense.offset; + uint32_t win_hi = s->layout.dense.offset + s->layout.dense.winsize - 1; + + /* Clamp to window */ + uint32_t eff_lo = (del_lo > win_lo) ? del_lo : win_lo; + uint32_t eff_hi = (del_hi < win_hi) ? del_hi : win_hi; + + if (eff_lo <= eff_hi) { + /* Clear every populated slot in the effective dense range. 
*/ + for (uint32_t off = eff_lo; off <= eff_hi; off++) { + uint32_t pos = off - s->layout.dense.offset; + if (!arIsEmpty(s->layout.dense.items[pos])) { + arTrackValueOut(ar, s->layout.dense.items[pos]); + arFreePtr(s->layout.dense.items[pos]); + s->layout.dense.items[pos] = NULL; + s->count--; + ar->count--; + deleted++; + } + } + + /* Update max_idx if affected */ + if (s->count > 0 && s->layout.dense.max_idx >= eff_lo) { + /* Scan backwards to find new max */ + s->layout.dense.max_idx = s->layout.dense.offset; + for (int32_t i = (int32_t)win_hi; i >= (int32_t)win_lo; i--) { + if (!arIsEmpty(s->layout.dense.items[i - s->layout.dense.offset])) { + s->layout.dense.max_idx = i; + break; + } + } + } + } + + /* Delete slice if empty, or demote it to sparse if we are + * below the threshold. */ + if (s->count == 0) { + arSliceFree(ar, s); + arSetSlice(ar, slice_id, NULL); + } else { + arSetSlice(ar, slice_id, arDenseMaybeDemote(ar, s)); + } + } else { + /* Sparse: deleted elements form a contiguous span in the sorted + * offsets/values arrays. Find that span, free the values in it, + * then compact the tail once. */ + uint16_t *offsets = s->layout.sparse.offsets; + void **values = s->layout.sparse.values; + + int found; + uint32_t first = arSparseFindPos(s, (uint16_t)del_lo, &found); + uint32_t last = arSparseFindPos(s, (uint16_t)del_hi, &found); + if (found) last++; + + /* Free all values in the contiguous sparse span to delete. */ + for (uint32_t i = first; i < last; i++) { + arTrackValueOut(ar, values[i]); + arFreePtr(values[i]); + } + + /* Shift the surviving tail left to close the deleted gap. 
*/ + if (first < last) { + uint32_t tail = s->count - last; + if (tail > 0) { + memmove(offsets + first, offsets + last, + tail * sizeof(uint16_t)); + memmove(values + first, values + last, + tail * sizeof(void *)); + } + + deleted = last - first; + s->count -= deleted; + ar->count -= deleted; + } + + if (s->count == 0) { + arSliceFree(ar, s); + arSetSlice(ar, slice_id, NULL); + } + } + + return deleted; +} + +/* Main range delete function: delete all elements in [lo..hi]. + * Returns number of elements deleted. + * + * Algorithm: + * 1. Compute slice boundaries + * 2. Handle boundary slices with element-level deletion + * 3. Delete full slices/blocks in between (O(1) per slice) + * 4. Update metadata (dir_highest_used, shrink directories) + * + * Complexity: O(S + N) where S = slices touched, N = boundary elements. + * Note that just looping with arGetSlice() and removing the in-the-middle + * slices one after the other would be much simpler but would have completely + * different complexity properties, in case of big span of empty indexes. */ +uint64_t arDeleteRange(redisArray *ar, uint64_t lo, uint64_t hi) { + if (ar->count == 0 || lo > hi) return 0; + + uint32_t slice_size = ar->slice_size; + uint64_t lo_slice = arSliceId(lo, slice_size); + uint64_t hi_slice = arSliceId(hi, slice_size); + uint32_t lo_off = arSliceOff(lo, slice_size); + uint32_t hi_off = arSliceOff(hi, slice_size); + + uint64_t deleted = 0; + int touched_highest = 0; /* Did we touch dir_highest_used? */ + + if (lo_slice == hi_slice) { + /* Range is within a single slice: element-level delete only */ + deleted = arDeleteSliceRange(ar, lo_slice, lo_off, hi_off); + if (lo_slice >= ar->dir_highest_used) touched_highest = 1; + } else { + /* Multiple slices: handle boundaries and full slices separately */ + + /* 1. Delete within lo_slice: [lo_off .. slice_size-1] */ + deleted += arDeleteSliceRange(ar, lo_slice, lo_off, slice_size - 1); + + /* 2. Delete within hi_slice: [0 .. 
hi_off] */ + deleted += arDeleteSliceRange(ar, hi_slice, 0, hi_off); + if (hi_slice >= ar->dir_highest_used) touched_highest = 1; + + /* 3. Delete full slices in between [lo_slice+1 .. hi_slice-1] */ + if (lo_slice + 1 <= hi_slice - 1) { + if (ar->superdir) { + /* Superdir mode: identify only the block entries that can + * contain slices in the middle range, then delete the covered + * slot interval inside each of those blocks. Iterate from high + * to low so removing an empty block does not invalidate the + * yet-to-be-visited entries. */ + uint64_t mid_lo = lo_slice + 1; + uint64_t mid_hi = hi_slice - 1; + uint64_t lo_block = mid_lo / AR_SUPER_BLOCK_SLOTS; + uint64_t hi_block = mid_hi / AR_SUPER_BLOCK_SLOTS; + + /* arSuperDirFind() is a lower-bound search on block_id. + * start is the first entry whose block_id is >= lo_block. + * end is the first entry whose block_id is > hi_block, so the + * blocks to visit are exactly [start, end). */ + int found; + uint32_t start = arSuperDirFind(ar, lo_block, &found); + uint32_t end = arSuperDirFind(ar, hi_block, &found); + if (found) end++; /* Convert matching index to past-the-end. */ + + /* Iterate backward because deleting the last slice in a block + * removes that block entry and compacts the superdir array. */ + for (int32_t bi = (int32_t)end - 1; bi >= (int32_t)start; bi--) { + arSDirEntry *e = ar->superdir + bi; + uint64_t block_base = e->block_id * AR_SUPER_BLOCK_SLOTS; + uint64_t block_end = block_base + AR_SUPER_BLOCK_SLOTS - 1; + + /* Convert the global middle slice range to the local slot + * interval covered inside this block. */ + uint32_t first_si = (mid_lo > block_base) ? + (uint32_t)(mid_lo - block_base) : 0; + uint32_t last_si = (mid_hi < block_end) ? + (uint32_t)(mid_hi - block_base) : AR_SUPER_BLOCK_SLOTS - 1; + + /* Delete each covered slice slot. The block itself, if it + * becomes empty, is removed after this local scan. 
*/ + for (uint32_t si = first_si; si <= last_si; si++) { + if (e->slots[si]) { + uint64_t slice_id = block_base + si; + deleted += e->slots[si]->count; + ar->count -= e->slots[si]->count; + arSliceFree(ar, e->slots[si]); + e->slots[si] = NULL; + e->count--; + ar->num_slices--; + if (slice_id >= ar->dir_highest_used) + touched_highest = 1; + } + } + + /* Remove the superdir block if empty. */ + if (e->count == 0) { + arSuperDirRemoveBlock(ar, bi); + } + } + } else { + /* Flat mode: delete full slices in middle range */ + uint64_t end = hi_slice - 1; + if (end >= ar->dir_alloc) end = ar->dir_alloc - 1; + + for (uint64_t sid = lo_slice + 1; sid <= end; sid++) { + if (ar->dir[sid]) { + deleted += ar->dir[sid]->count; + ar->count -= ar->dir[sid]->count; + arSliceFree(ar, ar->dir[sid]); + ar->dir[sid] = NULL; + ar->num_slices--; + if (sid >= ar->dir_highest_used) touched_highest = 1; + } + } + } + } + } + + /* Update dir_highest_used if we touched or deleted high slices */ + if (touched_highest && ar->count > 0) { + ar->dir_highest_used = 0; + if (ar->superdir) { + for (int32_t bi = ar->sdir_len - 1; bi >= 0; bi--) { + arSDirEntry *e = ar->superdir + bi; + if (e->count == 0) continue; + for (int32_t si = AR_SUPER_BLOCK_SLOTS - 1; si >= 0; si--) { + if (e->slots[si]) { + ar->dir_highest_used = e->block_id * AR_SUPER_BLOCK_SLOTS + si; + goto found_highest; + } + } + } + } else { + for (int64_t i = (int64_t)ar->dir_alloc - 1; i >= 0; i--) { + if (ar->dir[i]) { + ar->dir_highest_used = i; + goto found_highest; + } + } + } + } +found_highest: + + if (ar->count == 0) { + ar->dir_highest_used = 0; + } + + arDirMaybeShrink(ar); + return deleted; +} + +/* Truncate array: delete all elements with index >= limit. + * Used by ARRING to implement ring buffer wrap-around. + * + * This is implemented as a special case of arDeleteRange. limit==0 means + * "delete everything". 
*/ +void arTruncate(redisArray *ar, uint64_t limit) { + if (ar->count == 0) return; + + uint64_t len = arLen(ar); + if (limit >= len) return; /* Nothing to delete */ + + arDeleteRange(ar, limit, len - 1); +} + +/* ---------------------------------------------------------------------------- + * Properties + * -------------------------------------------------------------------------- */ + +/* Get count of non-empty elements */ +uint64_t arCount(redisArray *ar) { + return ar->count; +} + +/* Get logical length (max index + 1) */ +uint64_t arLen(redisArray *ar) { + if (ar->count == 0) return 0; + + arSlice *s = arGetSlice(ar, ar->dir_highest_used); + if (s == NULL) return 0; /* Defensive: if count>0 but slice missing, corrupted state */ + uint32_t local_max = arSliceMaxIdx(s); + return arMakeIdx(ar->dir_highest_used, local_max, ar->slice_size) + 1; +} + +/* ---------------------------------------------------------------------------- + * Range set optimization + * -------------------------------------------------------------------------- */ + +/* Pre-promote sparse slices to dense if a range set would overflow them. + * + * When ARSET writes many elements to a sparse slice, each insertion + * requires a sorted insert with memmove. If the slice eventually exceeds + * kmax elements, it gets promoted to dense anyway - wasting all that work. + * + * This function checks each slice touched by [lo, hi] and promotes it to + * dense upfront if the final element count would exceed kmax. Slices that + * will stay within kmax remain sparse. This way, bulk writes either go + * into sparse (if small) or dense (if large), but never do expensive + * sparse insertions followed by promotion. 
*/ +void arMayPromoteToDenseForRangeSet(redisArray *ar, uint64_t lo, uint64_t hi) { + if (ArraySparseKMax == 0) return; /* Sparse disabled, nothing to do */ + + uint64_t slice_lo = arSliceId(lo, ar->slice_size); + uint64_t slice_hi = arSliceId(hi, ar->slice_size); + + /* Ensure directory can hold all slices we might touch */ + arDirGrow(ar, slice_hi); + + for (uint64_t sid = slice_lo; sid <= slice_hi; sid++) { + /* Compute offset range within this slice */ + uint64_t range_start = (sid == slice_lo) ? lo : (sid << arSliceBits(ar->slice_size)); + uint64_t range_end = (sid == slice_hi) ? hi : ((sid + 1) << arSliceBits(ar->slice_size)) - 1; + uint32_t start_off = arSliceOff(range_start, ar->slice_size); + uint32_t end_off = arSliceOff(range_end, ar->slice_size); + uint32_t range_size = end_off - start_off + 1; + + arSlice *s = arGetSlice(ar, sid); + + if (s == NULL) { + /* No slice yet - create dense directly if range exceeds kmax */ + if (range_size > ArraySparseKMax) { + arSetSlice(ar, sid, arSliceDenseNew(ar, start_off, ar->slice_size)); + } + continue; + } + + if (s->encoding == AR_SLICE_DENSE) continue; /* Already dense */ + + /* Sparse slice - check if we need to promote */ + if (range_size > ArraySparseKMax) { + /* Range alone exceeds kmax, must promote */ + arSetSlice(ar, sid, arSparsePromote(ar, s, ar->slice_size)); + continue; + } + + /* Count existing elements in [start_off, end_off] via linear scan. + * Sparse slices have at most kmax elements, so this is O(kmax). 
*/ + uint16_t *offsets = s->layout.sparse.offsets; + uint32_t existing = 0; + for (uint32_t i = 0; i < s->count; i++) { + if (offsets[i] >= start_off && offsets[i] <= end_off) { + existing++; + } + } + + /* New elements = range_size - existing (slots we'll fill that are empty) */ + uint32_t new_elements = range_size - existing; + if (s->count + new_elements > ArraySparseKMax) { + arSetSlice(ar, sid, arSparsePromote(ar, s, ar->slice_size)); + } + } +} + +/* ---------------------------------------------------------------------------- + * Defragmentation + * -------------------------------------------------------------------------- */ + +/* Defrag one slice, fix the slice pointers that point inside its allocation + * and defrag the heap strings as well. + * + * If work is not NULL, also account for the slice scan performed here: + * dense slices add winsize, while sparse slices add count. We update the + * active defrag scanned statistic at the same time, so callers do not need + * to duplicate that logic. */ +static arSlice *arDefragSlice(arSlice *s, unsigned long *work, + void *(*defragfn)(void *)) { + /* 1. Try to defrag the slice itself. If the pointer changed, + * we need to also change the structure pointers pointing inside + * the allocation (that now has a different base address). */ + arSlice *new_s = defragfn(s); + if (new_s) { + s = new_s; + if (s->encoding == AR_SLICE_DENSE) + s->layout.dense.items = (void **)(s + 1); + else + arSparseSetupPointers(s); + } + + /* Defrag the arString() items. All the other items are + * encoded in the pointer value itself and need no handling. 
*/ + if (s->encoding == AR_SLICE_DENSE) { + for (uint32_t j = 0; j < s->layout.dense.winsize; j++) { + if (!arIsPtr(s->layout.dense.items[j])) continue; + void *new_ptr = defragfn(s->layout.dense.items[j]); + if (new_ptr) s->layout.dense.items[j] = new_ptr; + } + if (work) { + *work += s->layout.dense.winsize; + server.stat_active_defrag_scanned += s->layout.dense.winsize; + } + } else { + void **values = s->layout.sparse.values; + for (uint32_t j = 0; j < s->count; j++) { + if (!arIsPtr(values[j])) continue; + void *new_ptr = defragfn(values[j]); + if (new_ptr) values[j] = new_ptr; + } + if (work) { + *work += s->count; + server.stat_active_defrag_scanned += s->count; + } + } + return s; +} + +/* Defrag the array header and the top-level directory object that points to + * slices. This is the cheap metadata pass done before we walk the slices + * themselves. */ +static redisArray *arDefragTopLevel(redisArray *ar, void *(*defragfn)(void *)) { + redisArray *new_ar = defragfn(ar); + if (new_ar) ar = new_ar; + + if (ar->superdir) { + arSDirEntry *new_sdir = defragfn(ar->superdir); + if (new_sdir) ar->superdir = new_sdir; + } else if (ar->dir) { + arSlice **new_dir = defragfn(ar->dir); + if (new_dir) ar->dir = new_dir; + } + return ar; +} + +/* Encode the next superdir scan position as a single cursor. + * Cursor 0 means "start from the beginning" and also "finished". + * + * On 64-bit builds we encode block_id and slot, so resume is stable even if + * blocks before the current one are inserted or removed between defrag steps. + * + * On 32-bit builds the generic defrag cursor type is only unsigned long, so + * it cannot always hold a full 64-bit block_id. In that case we fall back to + * the positional (block-index, slot) encoding. 
*/ +static inline unsigned long arDefragSuperdirCursor(redisArray *ar, uint32_t bi, uint32_t si) { + serverAssert(si < AR_SUPER_BLOCK_SLOTS); +#if ULONG_MAX >= UINT64_MAX + uint64_t block_id = ar->superdir[bi].block_id; + serverAssert(block_id <= (ULONG_MAX - 1) / AR_SUPER_BLOCK_SLOTS); + return ((unsigned long)block_id * AR_SUPER_BLOCK_SLOTS + si) + 1; +#else + UNUSED(ar); + return ((unsigned long)bi * AR_SUPER_BLOCK_SLOTS + si) + 1; +#endif +} + +/* Decode the next superdir scan position stored in the incremental defrag + * cursor. */ +static void arDefragDecodeSuperdirCursor(redisArray *ar, unsigned long cursor, + uint32_t *bi, uint32_t *si) { + serverAssert(cursor > 0); + unsigned long pos = cursor - 1; +#if ULONG_MAX >= UINT64_MAX + /* Flat-mode cursors are also encoded as "slot + 1". After promotion to + * superdir, those old cursors still decode correctly here as block_id 0 + * with the same slot index, because flat mode only ever covers block 0 + * and arPromoteToSuperDir() copies the flat directory into block 0. */ + uint64_t block_id = pos / AR_SUPER_BLOCK_SLOTS; + int found; + + *si = pos % AR_SUPER_BLOCK_SLOTS; + *bi = arSuperDirFind(ar, block_id, &found); + if (!found) *si = 0; +#else + UNUSED(ar); + *bi = pos / AR_SUPER_BLOCK_SLOTS; + *si = pos % AR_SUPER_BLOCK_SLOTS; +#endif +} + +/* Defrag an array that is small enough that we can handle it + * in a single pass. */ +redisArray *arDefrag(redisArray *ar, void *(*defragfn)(void *)) { + ar = arDefragTopLevel(ar, defragfn); + + if (ar->superdir) { + /* Defrag each block slots array, then each slice referenced by it. 
*/ + for (uint32_t bi = 0; bi < ar->sdir_len; bi++) { + arSDirEntry *e = ar->superdir + bi; + arSlice **new_slots = defragfn(e->slots); + if (new_slots) e->slots = new_slots; + + for (uint32_t si = 0; si < AR_SUPER_BLOCK_SLOTS; si++) { + if (e->slots[si] == NULL) continue; + e->slots[si] = arDefragSlice(e->slots[si], NULL, defragfn); + } + } + } else if (ar->dir) { + /* Defrag each slice referenced by the flat directory. */ + for (uint64_t i = 0; i < ar->dir_alloc; i++) { + if (ar->dir[i] == NULL) continue; + ar->dir[i] = arDefragSlice(ar->dir[i], NULL, defragfn); + } + } + + return ar; +} + +/* Incremental defrag step for arrays. Cursor 0 means "start from the + * beginning" and also "no more work". + * + * Work is counted explicitly in order to keep one call roughly aligned with + * active_defrag_max_scan_fields: + * + * 1. Visiting one flat directory entry costs 1. + * 2. In superdir mode, visiting one top-level block entry costs 1, and + * visiting one slot inside that block costs another 1. + * 3. Defragmenting a slice then adds the cost of scanning that slice: + * sparse slices add s->count, while dense slices add winsize. + * + * Slices are still defragmented as whole units. So a dense slice may cause one + * call to overshoot the configured budget, but we still stop immediately after + * that slice in order to resume from the next cursor position later. 
*/ +unsigned long arDefragIncremental(redisArray **arref, unsigned long cursor, + void *(*defragfn)(void *)) +{ + redisArray *ar = *arref; + unsigned long work = 0; + unsigned long maxwork = server.active_defrag_max_scan_fields; + if (ar == NULL) return 0; + + if (cursor == 0) { + ar = arDefragTopLevel(ar, defragfn); + *arref = ar; + } + + if (ar->superdir) { + uint32_t bi = 0, si = 0; + if (cursor != 0) arDefragDecodeSuperdirCursor(ar, cursor, &bi, &si); + + for (; bi < ar->sdir_len; bi++, si = 0) { + arSDirEntry *e = ar->superdir + bi; + /* Defrag the block slots array once when we enter the block from + * its first slot. If we later resume in the middle of the same + * block, the slots array was already handled. */ + if (si == 0) { + arSlice **new_slots = defragfn(e->slots); + if (new_slots) e->slots = new_slots; + work++; + server.stat_active_defrag_scanned++; + } + + for (; si < AR_SUPER_BLOCK_SLOTS; si++) { + arSlice *s = e->slots[si]; + work++; + server.stat_active_defrag_scanned++; + + if (s == NULL) { + if (work > maxwork) { + si++; + if (si == AR_SUPER_BLOCK_SLOTS) { + bi++; + si = 0; + } + if (bi >= ar->sdir_len) return 0; + return arDefragSuperdirCursor(ar, bi, si); + } + continue; + } + + e->slots[si] = arDefragSlice(s, &work, defragfn); + + if (work > maxwork) { + si++; + if (si == AR_SUPER_BLOCK_SLOTS) { + bi++; + si = 0; + } + if (bi >= ar->sdir_len) return 0; + return arDefragSuperdirCursor(ar, bi, si); + } + } + } + return 0; + } + + if (ar->dir == NULL) return 0; + + uint64_t i = (cursor == 0) ? 
0 : cursor - 1; + for (; i < ar->dir_alloc; i++) { + arSlice *s = ar->dir[i]; + work++; + server.stat_active_defrag_scanned++; + + if (s == NULL) { + if (work > maxwork) { + i++; + if (i >= ar->dir_alloc) return 0; + return i + 1; + } + continue; + } + + ar->dir[i] = arDefragSlice(s, &work, defragfn); + + if (work > maxwork) { + i++; + if (i >= ar->dir_alloc) return 0; + return i + 1; + } + } + return 0; +} diff --git a/src/sparsearray.h b/src/sparsearray.h new file mode 100644 index 000000000..c0444ee7b --- /dev/null +++ b/src/sparsearray.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2026-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + * + * Sparse Array - A memory-efficient sparse array with 64-bit index space. + * + * This data structure was designed and implemented by Salvatore Sanfilippo. + */ + +#ifndef __SPARSEARRAY_H +#define __SPARSEARRAY_H + +#include +#include +#include + +/* ============================================================================ + * SPARSE ARRAY OVERVIEW + * ============================================================================ + * + * Sparse arrays are random-access sequences indexed by non-negative 64-bit + * integers. They support O(1) get/set operations and efficient iteration. + * + * MEMORY LAYOUT + * ------------- + * The array uses a two-level structure: a directory pointing to "slices", + * which contain just a range of elements. For very large/sparse arrays, a + * three-level "superdir" structure is used. + * + * SLICE TYPES + * ----------- + * Each slice holds up to slice_size elements and can be: + * + * - Sparse: Sorted array of (offset, value) pairs. Memory-efficient when + * elements are scattered within the slice. + * + * - Dense: Contiguous array with a sliding window. 
Used when the slice + * has many elements. + * + * VALUE ENCODING (Tagged Pointers) + * -------------------------------- + * Values are stored in tagged pointer-sized words, using the low 2 bits as a + * tag. The exact immediate encoding depends on pointer width: + * + * 64-bit builds: + * Tag 00: arString pointer (heap-allocated, 8+ byte strings) + * Tag 01: Immediate signed integer in the 62-bit payload + * Tag 10: Immediate double (low 2 bits of the IEEE-754 payload cleared) + * Tag 11: Inline small string (0-7 bytes) + * + * 32-bit builds: + * Tag 00: arString pointer + * Tag 01: Immediate signed integer in the 30-bit payload + * Tag 10: Immediate float (low 2 bits of the IEEE-754 payload cleared) + * Tag 11: Inline small string (0-3 bytes) + * + * RDB persistence is architecture-neutral: values are saved as logical ints, + * doubles and strings, never as raw tagged words. + * ========================================================================== */ + +/* ---------------------------------------------------------------------------- + * Configuration defaults + * -------------------------------------------------------------------------- */ + +#define AR_SLICE_SIZE_DEFAULT 4096 +#define AR_SLICE_SIZE_MIN 256 +#define AR_SLICE_SIZE_MAX 65536 +#define AR_SPARSE_KMAX_DEFAULT 10 +#define AR_SPARSE_KMIN_DEFAULT 5 + +/* Superdir: fixed-size blocks of slice pointers. Each block holds 2048 + * pointers to actual array slices, which uses about 8 KB on 32-bit builds + * and 16 KB on 64-bit builds. This keeps very large indices from forcing + * catastrophic flat-directory growth. */ +#define AR_SUPER_BLOCK_SLOTS 2048 + +/* Internal constants */ +#define AR_SLICE_MIN_ALLOC 8 /* Initial dense window allocation */ +#define AR_INSERT_IDX_NONE UINT64_MAX /* No insert performed yet */ + +/* Slice encoding types */ +#define AR_SLICE_DENSE 0 +#define AR_SLICE_SPARSE 1 + +/* Tagged value encoding (low 2 bits). NULL (0) means empty slot. 
*/ +#define AR_TAG_PTR ((uintptr_t)0) /* arString pointer (low 2 bits = 00) */ +#define AR_TAG_INT ((uintptr_t)1) /* Immediate signed integer (01) */ +#define AR_TAG_FLOAT ((uintptr_t)2) /* Immediate float (10) */ +#define AR_TAG_STR ((uintptr_t)3) /* Inline small string (11) */ +#define AR_TAG_MASK ((uintptr_t)3) + +#if UINTPTR_MAX == UINT64_MAX +#define AR_SMALLSTR_MAXLEN 7 +#define AR_SMALLSTR_LEN_MASK 0x7u +#elif UINTPTR_MAX == UINT32_MAX +#define AR_SMALLSTR_MAXLEN 3 +#define AR_SMALLSTR_LEN_MASK 0x3u +#else +#error "Unsupported pointer size" +#endif + +/* RDB type tags for array elements */ +#define AR_RDB_TAG_SDS 0 +#define AR_RDB_TAG_INT 1 +#define AR_RDB_TAG_FLOAT 2 +#define AR_RDB_TAG_SMALLSTR 3 + +/* Buffer size for inline types (int/float/smallstr) */ +#define AR_INLINE_BUFSIZE 64 + +/* ---------------------------------------------------------------------------- + * Data structures + * -------------------------------------------------------------------------- */ + +/* Array slice: holds a range of elements. Single allocation with payload. */ +typedef struct arSlice { + uint8_t encoding; /* 0=dense, 1=sparse */ + uint8_t _pad1[3]; + uint32_t count; /* Non-empty items in this slice */ + union { + struct { + uint32_t offset; /* First logical offset in window */ + uint32_t winsize; /* Window size (power of two) */ + uint32_t max_idx; /* Highest offset with a value */ + void **items; /* Points into payload */ + } dense; + struct { + uint32_t cap; /* Capacity */ + uint16_t *offsets; /* Points into payload */ + void **values; /* Points into payload (aligned) */ + } sparse; + } layout; +} arSlice; + +/* Super-directory entry: groups slices into fixed-size pointer blocks. 
*/ +typedef struct arSDirEntry { + uint64_t block_id; /* slice_id / AR_SUPER_BLOCK_SLOTS */ + uint32_t count; /* Non-NULL slots in this block */ + uint32_t _pad; + arSlice **slots; /* AR_SUPER_BLOCK_SLOTS pointers to slices */ +} arSDirEntry; + +/* Array header */ +typedef struct redisArray { + uint64_t count; /* Total non-empty items */ + uint64_t insert_idx; /* Last insert index, or UINT64_MAX if none */ + uint64_t dir_alloc; /* Flat directory length (flat mode) */ + uint64_t dir_highest_used; /* Highest non-NULL slice index */ + uint64_t num_slices; /* Number of allocated slices */ + size_t alloc_size; /* Tracked total allocation (for slot stats) */ + uint32_t slice_size; /* Slice size (power of two) */ + uint32_t sdir_len; /* Superdir entries count */ + uint32_t sdir_cap; /* Superdir capacity */ + uint32_t _pad; + arSlice **dir; /* Flat directory or NULL */ + arSDirEntry *superdir; /* Super-directory or NULL */ +} redisArray; + +/* ---------------------------------------------------------------------------- + * Inline helpers: index arithmetic + * -------------------------------------------------------------------------- */ + +/* Compute bits needed to address elements within a slice. 
*/ +static inline int arSliceBits(uint32_t slice_size) { + if (slice_size == 4096) return 12; /* Fast path for default */ + int bits = 0; + uint32_t x = slice_size; + while (x > 1) { x >>= 1; bits++; } + return bits; +} + +static inline uint64_t arSliceId(uint64_t idx, uint32_t slice_size) { + return idx >> arSliceBits(slice_size); +} + +static inline uint32_t arSliceOff(uint64_t idx, uint32_t slice_size) { + return (uint32_t)(idx & (slice_size - 1)); +} + +static inline uint64_t arMakeIdx(uint64_t slice_id, uint32_t off, uint32_t slice_size) { + return (slice_id << arSliceBits(slice_size)) | off; +} + +/* ---------------------------------------------------------------------------- + * Inline helpers: tagged value encoding + * -------------------------------------------------------------------------- */ + +static inline int arIsEmpty(void *v) { return v == NULL; } + +static inline int arIsPtr(void *v) { + return v != NULL && ((uintptr_t)v & AR_TAG_MASK) == AR_TAG_PTR; +} + +static inline int arIsInt(void *v) { + return ((uintptr_t)v & AR_TAG_MASK) == AR_TAG_INT; +} + +static inline int64_t arToInt(void *v) { + return (int64_t)(intptr_t)v >> 2; /* Arithmetic shift preserves sign */ +} + +static inline void *arFromInt(int64_t ival) { + return (void *)(((uintptr_t)ival << 2) | AR_TAG_INT); +} + +static inline int arIntFits(int64_t ival) { +#if UINTPTR_MAX == UINT64_MAX + return ival >= -(1LL << 61) && ival <= (1LL << 61) - 1; +#else + return ival >= -(1LL << 29) && ival <= (1LL << 29) - 1; +#endif +} + +static inline int arIsFloat(void *v) { + return ((uintptr_t)v & AR_TAG_MASK) == AR_TAG_FLOAT; +} + +static inline double arToDouble(void *v) { +#if UINTPTR_MAX == UINT64_MAX + uint64_t bits = (uintptr_t)v & ~AR_TAG_MASK; + double d; + memcpy(&d, &bits, sizeof(d)); + return d; +#else + uint32_t bits = (uint32_t)((uintptr_t)v & ~(uintptr_t)AR_TAG_MASK); + float f; + memcpy(&f, &bits, sizeof(f)); + return (double)f; +#endif +} + +static inline void 
*arFromFloatBits(uint64_t bits_trunc) { +#if UINTPTR_MAX == UINT64_MAX + return (void *)((bits_trunc & ~AR_TAG_MASK) | AR_TAG_FLOAT); +#else + uint32_t bits32 = (uint32_t)bits_trunc; + return (void *)(uintptr_t)((bits32 & ~(uint32_t)AR_TAG_MASK) | AR_TAG_FLOAT); +#endif +} + +static inline int arIsSmallStr(void *v) { + return ((uintptr_t)v & AR_TAG_MASK) == AR_TAG_STR; +} + +static inline int arSmallStrLen(void *v) { + return (int)(((uintptr_t)v >> 2) & AR_SMALLSTR_LEN_MASK); +} + +static inline int arToSmallStr(void *v, char *buf) { + int len = arSmallStrLen(v); + uintptr_t val = (uintptr_t)v; + for (int i = 0; i < len; i++) { + buf[i] = (char)((val >> (8 * (i + 1))) & 0xFF); + } + buf[len] = '\0'; + return len; +} + +static inline void *arFromSmallStr(const char *s, int len) { + uintptr_t v = AR_TAG_STR | ((uintptr_t)len << 2); + for (int i = 0; i < len; i++) { + v |= ((uintptr_t)(uint8_t)s[i]) << (8 * (i + 1)); + } + return (void *)v; +} + +/* ---------------------------------------------------------------------------- + * Public API + * -------------------------------------------------------------------------- */ + +/* Lifecycle */ +redisArray *arNew(void); +void arFree(redisArray *ar); +redisArray *arDup(redisArray *ar); +void arDismiss(redisArray *ar, size_t size_hint); + +/* Element access */ +void *arGet(redisArray *ar, uint64_t idx); +void arSet(redisArray *ar, uint64_t idx, void *v); +int arDel(redisArray *ar, uint64_t idx); + +/* Value encoding/decoding */ +void *arEncode(const char *s, size_t len); +const char *arDecode(void *v, char *buf, size_t bufsize, size_t *outlen); +int arFormatFloat(double d, char *buf, size_t bufsize); +size_t arStringLen(const void *ptr); +const char *arStringData(const void *ptr); +void *arValueFromRdbInt(int64_t ival); +void *arValueFromRdbFloat(double d); +void *arValueFromRdbSmallStr(const char *s, size_t len); + +/* Queries */ +uint64_t arCount(redisArray *ar); +uint64_t arLen(redisArray *ar); + +/* Bulk operations */ 
+uint64_t arDeleteRange(redisArray *ar, uint64_t lo, uint64_t hi); +void arTruncate(redisArray *ar, uint64_t limit); +void arMayPromoteToDenseForRangeSet(redisArray *ar, uint64_t lo, uint64_t hi); + +/* Utilities */ +uint32_t arSparseFindPos(arSlice *s, uint16_t rel_idx, int *found); +uint32_t arSuperDirFind(redisArray *ar, uint64_t block_id, int *found); +redisArray *arDefrag(redisArray *ar, void *(*defragfn)(void *)); +unsigned long arDefragIncremental(redisArray **arref, unsigned long cursor, + void *(*defragfn)(void *)); + +#endif /* __SPARSEARRAY_H */ diff --git a/src/t_array.c b/src/t_array.c new file mode 100644 index 000000000..4fb72f8da --- /dev/null +++ b/src/t_array.c @@ -0,0 +1,2021 @@ +/* + * Copyright (c) 2026-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + * + * Redis Array commands implementation. + * Originally authored by: Salvatore Sanfilippo. + * + * The core sparse array data structure is in sparsearray.c/sparsearray.h. + * This file contains Redis command handlers and Redis-specific operations. + */ + +#include "server.h" +#include "../deps/tre/local_includes/tre.h" +#include + +/****************************************************************************** + * + * ARRAY COMMANDS AND HIGHER LEVEL LOGIC + * + * This section contains all the Redis commands for the Array type, as well + * as the type operations used by COPY and other server-level functionality. 
+ * + *****************************************************************************/ + +/* ---------------------------------------------------------------------------- + * Array type operations for COPY command + * -------------------------------------------------------------------------- */ + +robj *arrayTypeDup(robj *o) { + redisArray *ar = o->ptr; + redisArray *dup = arDup(ar); + robj *newobj = createObject(OBJ_ARRAY, dup); + newobj->encoding = OBJ_ENCODING_SLICED_ARRAY; + return newobj; +} + +/* ---------------------------------------------------------------------------- + * Internal helpers + * -------------------------------------------------------------------------- */ + +#define ARGETRANGE_MAX_ITEMS 1000000 + +/* Look up the array object for writing, create it if missing, or reply with + * WRONGTYPE and return NULL if the key holds a different type. */ +robj *lookupArrayForWriteOrReply(client *c, robj *key) { + robj *o = lookupKeyWrite(c->db, key); + if (o == NULL) { + o = createArrayObject(); + dbAdd(c->db, key, &o); + } else if (checkType(c, o, OBJ_ARRAY)) { + return NULL; + } + return o; +} + +/* Reply with an array value. This helper is needed because we use + * tagged pointers to inline values like floats, integers and small + * strings directly inside the pointer: big memory savings, but more + * work is needed when it is time to reply to the client. */ +void addReplyArrayValue(client *c, void *v) { + if (arIsEmpty(v)) { + addReplyNull(c); + return; + } + + char buf[AR_INLINE_BUFSIZE]; + size_t len; + const char *data = arDecode(v, buf, sizeof(buf), &len); + addReplyBulkCBuffer(c, data, len); +} + +/* Parse array index from object. Accepts 0 to 2^64-2 by default. + * If allow_max is true, also accepts UINT64_MAX. This is used by ARSEEK + * because ARSEEK UINT64_MAX sets insert_idx to UINT64_MAX-1, which is + * a valid terminal state (next ARINSERT would overflow). + * Returns C_OK/C_ERR. Does NOT send error reply - caller decides.
*/ +int getArrayIndexFromObject(robj *o, uint64_t *idx, int allow_max) { + unsigned long long ull; + if (o->encoding == OBJ_ENCODING_INT) { + if ((long)o->ptr < 0) return C_ERR; + ull = (unsigned long long)(long)o->ptr; + } else { + if (!string2ull(o->ptr, &ull)) return C_ERR; + } + if (ull == UINT64_MAX && !allow_max) return C_ERR; + *idx = ull; + return C_OK; +} + +/* Parse an array index argument and reply with an error on failure. */ +int arrayParseIndexOrReply(client *c, robj *arg, uint64_t *idx) { + if (getArrayIndexFromObject(arg, idx, 0) != C_OK) { + addReplyError(c, "invalid array index"); + return C_ERR; + } + return C_OK; +} + +/* ---------------------------------------------------------------------------- + * ARGET / ARMGET + * -------------------------------------------------------------------------- */ + +/* ARGET key idx + * + * Returns the value at idx in O(1). + * Missing keys and holes both reply with NULL. */ +void argetCommand(client *c) { + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o && checkType(c, o, OBJ_ARRAY)) return; + + uint64_t idx; + if (arrayParseIndexOrReply(c, c->argv[2], &idx) != C_OK) return; + + void *v = o ? arGet(o->ptr, idx) : NULL; + addReplyArrayValue(c, v); +} + +/* ARMGET key idx [idx ...] + * + * Returns the values at the requested indices in O(N), where N is the number + * of indices. Missing keys and holes reply with NULLs. All indices are + * validated before the reply starts, so malformed input fails atomically. */ +void armgetCommand(client *c) { + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o != NULL && checkType(c, o, OBJ_ARRAY)) return; + + /* Pre-validate all indices so malformed input fails the whole command, + * like the other array commands. 
*/ + for (int i = 2; i < c->argc; i++) { + uint64_t idx; + if (arrayParseIndexOrReply(c, c->argv[i], &idx) != C_OK) return; + } + + addReplyArrayLen(c, c->argc - 2); + + for (int i = 2; i < c->argc; i++) { + if (o == NULL) { + /* Non-existing keys are semantically equivalent + * to non-existing indexes of existing arrays. */ + addReplyNull(c); + continue; + } + + uint64_t idx = 0; + getArrayIndexFromObject(c->argv[i], &idx, 0); /* Already validated. */ + + redisArray *ar = o->ptr; + void *v = arGet(ar, idx); + addReplyArrayValue(c, v); + } +} + +/* ---------------------------------------------------------------------------- + * ARSET / ARMSET + * -------------------------------------------------------------------------- */ + +/* ARSET key idx value [value ...] + * + * Sets one or more contiguous values starting at idx in O(N), where N is the + * number of values. Creates the array if needed and returns the number of + * previously empty slots that were filled. */ + void arsetCommand(client *c) { + uint64_t start_idx; + if (arrayParseIndexOrReply(c, c->argv[2], &start_idx) != C_OK) return; + + int num_values = c->argc - 3; + + /* Pre-validate: check for overflow and forbidden max index. */ + uint64_t last_idx = start_idx + (uint64_t)num_values - 1; + if (last_idx < start_idx || last_idx == UINT64_MAX) { + addReplyError(c, "array index overflow"); + return; + } + + robj *o = lookupArrayForWriteOrReply(c, c->argv[1]); + if (o == NULL) return; + + redisArray *ar = o->ptr; + uint64_t old_count = arCount(ar); + size_t old_alloc = 0; + if (server.memory_tracking_enabled) old_alloc = kvobjAllocSize(o); + + /* Pre-promote sparse slices only for true bulk sets. A single-element + * write does not benefit from the extra range-analysis pass.
*/ + if (num_values > 1) + arMayPromoteToDenseForRangeSet(ar, start_idx, last_idx); + + /* Write all values at consecutive indices, starting at start_idx. */ + uint64_t idx = start_idx; + for (int i = 3; i < c->argc; i++) { + sds val = c->argv[i]->ptr; + void *v = arEncode(val, sdslen(val)); + arSet(ar, idx, v); + idx++; + } + + long long set_count = arCount(ar) - old_count; + updateKeysizesHist(c->db, OBJ_ARRAY, old_count, arCount(ar)); + if (server.memory_tracking_enabled) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc, kvobjAllocSize(o)); + keyModified(c, c->db, c->argv[1], o, 1); + notifyKeyspaceEvent(NOTIFY_ARRAY, "arset", c->argv[1], c->db->id); + server.dirty += num_values; + addReplyLongLong(c, set_count); +} + +/* ARMSET key idx value [idx value ...] + * + * Sets multiple scattered index/value pairs in O(N), where N is the number of + * pairs. Creates the array if needed, returns the number of newly filled + * slots, and validates all indices before mutating. */ +void armsetCommand(client *c) { + if ((c->argc - 2) % 2 != 0) { + addReplyErrorArity(c); + return; + } + + /* Validate all indices first, before the key is looked up or created. */ + for (int i = 2; i < c->argc; i += 2) { + uint64_t idx; + if (arrayParseIndexOrReply(c, c->argv[i], &idx) != C_OK) return; + } + + robj *o = lookupArrayForWriteOrReply(c, c->argv[1]); + if (o == NULL) return; + + redisArray *ar = o->ptr; + uint64_t old_count = arCount(ar); + size_t old_alloc = 0; + if (server.memory_tracking_enabled) old_alloc = kvobjAllocSize(o); + + for (int i = 2; i < c->argc; i += 2) { + uint64_t idx = 0; + getArrayIndexFromObject(c->argv[i], &idx, 0); /* Already validated. */ + + sds val = c->argv[i + 1]->ptr; + void *v = arEncode(val, sdslen(val)); + arSet(ar, idx, v); + } + + int num_pairs = (c->argc - 2) / 2; + long long set_count = arCount(ar) - old_count; + updateKeysizesHist(c->db, OBJ_ARRAY, old_count, arCount(ar)); + if (server.memory_tracking_enabled) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc,
kvobjAllocSize(o)); + keyModified(c, c->db, c->argv[1], o, 1); + notifyKeyspaceEvent(NOTIFY_ARRAY, "armset", c->argv[1], c->db->id); + server.dirty += num_pairs; + addReplyLongLong(c, set_count); +} + +/* ---------------------------------------------------------------------------- + * ARDEL / ARDELRANGE + * -------------------------------------------------------------------------- */ + +/* ARDEL key idx [idx ...] + * + * Deletes the specified indices in O(N), where N is the number of indices. + * All indices are validated first, and if the array becomes empty the key + * itself is deleted. The number of deleted (existing) items is returned. */ +void ardelCommand(client *c) { + /* Pre-validate all indices before mutating, to report syntax errors + * even if the key doesn't exist. */ + for (int i = 2; i < c->argc; i++) { + uint64_t idx; + if (arrayParseIndexOrReply(c, c->argv[i], &idx) != C_OK) return; + } + + robj *o = lookupKeyWrite(c->db, c->argv[1]); + if (o == NULL) { + addReplyLongLong(c, 0); + return; + } + if (checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + uint64_t old_count = arCount(ar); + size_t old_alloc = 0; + if (server.memory_tracking_enabled) old_alloc = kvobjAllocSize(o); + long long deleted = 0; + + for (int i = 2; i < c->argc; i++) { + uint64_t idx = 0; + getArrayIndexFromObject(c->argv[i], &idx, 0); /* Already validated */ + deleted += arDel(ar, idx); + } + + int keyremoved = (arCount(ar) == 0); + if (server.memory_tracking_enabled && deleted > 0 && keyremoved) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc, kvobjAllocSize(o)); + if (deleted > 0) { + if (keyremoved) + dbDeleteSkipKeysizesUpdate(c->db, c->argv[1]); + updateKeysizesHist(c->db, OBJ_ARRAY, + old_count, keyremoved ? -1 : (int64_t)arCount(ar)); + if (server.memory_tracking_enabled && !keyremoved) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc, kvobjAllocSize(o)); + keyModified(c, c->db, c->argv[1], keyremoved ? 
NULL : o, 1); + notifyKeyspaceEvent(NOTIFY_ARRAY, "ardel", c->argv[1], c->db->id); + if (keyremoved) + notifyKeyspaceEvent(NOTIFY_GENERIC, "del", c->argv[1], c->db->id); + server.dirty += deleted; + } + addReplyLongLong(c, deleted); +} + +/* ARDELRANGE key start end [start end ...] + * + * Deletes one or more ranges. Complexity is proportional to the existing + * elements / slices touched, not to the numeric span of the requested ranges, + * so huge ranges do not block the server forever. + * + * Each pair may be given in either order. All ranges are validated up front, + * and an empty resulting array deletes the key. */ +void ardelrangeCommand(client *c) { + if ((c->argc - 2) % 2 != 0) { + addReplyErrorArity(c); + return; + } + + /* Pre-validate all ranges before mutating, to avoid partial updates + * if a later range has invalid syntax. */ + for (int i = 2; i < c->argc; i += 2) { + uint64_t start, end; + if (arrayParseIndexOrReply(c, c->argv[i], &start) != C_OK) return; + if (arrayParseIndexOrReply(c, c->argv[i + 1], &end) != C_OK) return; + } + + robj *o = lookupKeyWrite(c->db, c->argv[1]); + if (o == NULL) { + addReplyLongLong(c, 0); + return; + } + if (checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + uint64_t old_count = arCount(ar); + size_t old_alloc = 0; + if (server.memory_tracking_enabled) old_alloc = kvobjAllocSize(o); + uint64_t total_deleted = 0; + + /* Process each range using the generalized arDeleteRange */ + for (int i = 2; i < c->argc; i += 2) { + uint64_t start = 0, end = 0; + getArrayIndexFromObject(c->argv[i], &start, 0); /* Already validated */ + getArrayIndexFromObject(c->argv[i + 1], &end, 0); + + uint64_t lo = (start <= end) ? start : end; + uint64_t hi = (start <= end) ? 
end : start; + + total_deleted += arDeleteRange(ar, lo, hi); + } + + int keyremoved = (arCount(ar) == 0); + if (server.memory_tracking_enabled && total_deleted > 0 && keyremoved) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc, kvobjAllocSize(o)); + if (total_deleted > 0) { + if (keyremoved) + dbDeleteSkipKeysizesUpdate(c->db, c->argv[1]); + updateKeysizesHist(c->db, OBJ_ARRAY, + old_count, keyremoved ? -1 : (int64_t)arCount(ar)); + if (server.memory_tracking_enabled && !keyremoved) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc, kvobjAllocSize(o)); + keyModified(c, c->db, c->argv[1], keyremoved ? NULL : o, 1); + notifyKeyspaceEvent(NOTIFY_ARRAY, "ardelrange", c->argv[1], c->db->id); + if (keyremoved) + notifyKeyspaceEvent(NOTIFY_GENERIC, "del", c->argv[1], c->db->id); + server.dirty += total_deleted; + } + addReplyUnsignedLongLong(c, total_deleted); +} + +/* ---------------------------------------------------------------------------- + * ARLEN / ARCOUNT + * -------------------------------------------------------------------------- */ + +/* ARLEN key + * + * Returns max-index-plus-one in O(1). + * Missing keys reply with 0. */ +void arlenCommand(client *c) { + robj *o = lookupKeyReadOrReply(c, c->argv[1], shared.czero); + if (o == NULL || checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + addReplyUnsignedLongLong(c, arLen(ar)); +} + +/* ARCOUNT key + * + * Returns the number of non-empty elements in O(1). + * Missing keys reply with 0. 
*/ +void arcountCommand(client *c) { + robj *o = lookupKeyReadOrReply(c, c->argv[1], shared.czero); + if (o == NULL || checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + addReplyUnsignedLongLong(c, arCount(ar)); +} + +/* ---------------------------------------------------------------------------- + * ARGETRANGE + * -------------------------------------------------------------------------- */ + +/* ARGETRANGE key start end + * + * Returns every position in the requested range in O(N), where N is the range + * length. Holes are returned as NULLs, and a missing key behaves like an all- + * NULL array. If start > end the reply order is reversed. + * + * To avoid giant synthetic NULL replies, the range length is hard-limited, + * otherwise the command, with a wrong range, could make the server totally + * unusable. The max range is 1 million elements and is fixed: it is part of + * the user-facing semantics of the command. */ +void argetrangeCommand(client *c) { + uint64_t start, end; + if (arrayParseIndexOrReply(c, c->argv[2], &start) != C_OK) return; + if (arrayParseIndexOrReply(c, c->argv[3], &end) != C_OK) return; + + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o != NULL && checkType(c, o, OBJ_ARRAY)) return; + + int reverse = start > end; + uint64_t lo = reverse ? end : start; + uint64_t hi = reverse ? start : end; + uint64_t len = hi - lo + 1; + + /* ARGETRANGE is a special command: it can trigger a huge reply blocking + * the server (basically forever) even if there is no actual data. This + * is unlike an SMEMBERS against a very big key: somebody added so many + * elements inside, before asking for a huge number of elements. But, in the + * case of ARGETRANGE, you can just trigger a huge number of NULLs to be + * sent to the client.
In ARSCAN this was optimized to be O(N) with the + * actual populated elements, but in this case it can't be done because + * of the semantics of the command, and the inability of the Redis protocol + * to reply with run-length ranges (a, b, c, 1293455 NULLs, d, e). + * + * Because of all that, we put a hard limit on the range size, and this + * limit must be part of the Redis culture, so it should not be tuned in + * any way: 1 million items, with a hard error if the range is bigger than + * that, not just a silent trimming at this length, which would cause + * hard-to-track bugs. */ + if (len > ARGETRANGE_MAX_ITEMS) { + addReplyErrorFormat(c, "range exceeds maximum of %u items", + ARGETRANGE_MAX_ITEMS); + return; + } + + addReplyArrayLen(c, len); + if (o == NULL) { + for (uint64_t i = 0; i < len; i++) addReplyNull(c); + return; + } + + redisArray *ar = o->ptr; + if (reverse) { + for (uint64_t idx = hi; ; idx--) { + void *v = arGet(ar, idx); + addReplyArrayValue(c, v); + if (idx == lo) break; + } + } else { + for (uint64_t idx = lo; idx <= hi; idx++) { + void *v = arGet(ar, idx); + addReplyArrayValue(c, v); + } + } +} + +/* ---------------------------------------------------------------------------- + * ARSCAN + * -------------------------------------------------------------------------- */ + +/* Iterate populated elements in [start..end]. + * + * This iterator is read-only and not mutation-stable: between Init() and the + * final Next() that returns 0, the caller must not write to the array. Any + * write may free or relocate the current slice, making the iterator state + * stale. The goal of this abstraction was to capture repeated code in the + * implementation of ARSCAN, ARGREP, AROP. + * + * The struct lives on the caller stack, so setup and iteration stay allocation + * free and command-local. */ +typedef struct { + redisArray *ar; + uint64_t lo; /* Normalized inclusive range start. */ + uint64_t hi; /* Normalized inclusive range end.
*/ + uint64_t lo_slice; /* First slice touched by the range. */ + uint64_t hi_slice; /* Last slice touched by the range. */ + uint32_t slice_size; /* Cached slice size. */ + int reverse; /* Iterate from high to low. */ + int32_t step; /* +1 forward, -1 backward. */ + int done; /* No more elements to return. */ + int top_done; /* No more slices to inspect after current. */ + + uint64_t slice_id; /* Next flat-directory slice to inspect. */ + int32_t sdir_index; /* Next superdir entry to inspect. */ + int32_t slot_index; /* Next slot inside the current superdir entry. */ + + arSlice *slice; /* Slice currently being scanned. */ + uint64_t slice_base; /* Logical index of slice offset 0. */ + uint32_t off_lo; /* First in-range offset for current slice. */ + uint32_t off_hi; /* Last in-range offset for current slice. */ + int dense; /* Current slice is dense. */ + void **dense_items; /* Dense items window. */ + int32_t dense_off; /* Current dense logical offset. */ + int32_t dense_item_pos; /* Current dense window index. */ + int32_t dense_item_end; /* Final dense window index. */ + uint16_t *sparse_offsets; /* Sparse offsets array. */ + void **sparse_values; /* Sparse values array. */ + int32_t sparse_count; /* Sparse entry count. */ + int32_t sparse_pos; /* Current sparse entry position. */ + int slice_ready; /* Current slice scan state is initialized. */ +} arScanIter; + +#define AR_SCAN_ITER_SLOT_UNSET INT32_MIN + +/* Keep the per-element iterator hot path inline in the command loops. + * It helps a lot with certain targets, up to ~30-50% speed regression + * without forcing the inlining. */ +#if defined(__GNUC__) || defined(__clang__) +#define ALWAYS_INLINE __attribute__((always_inline)) inline +#else +#define ALWAYS_INLINE inline +#endif + +/* Initialize a populated-elements iterator. Empty arrays and empty clipped + * ranges are turned into a done iterator here so the first Next() is a single + * branch. 
*/ +static void arScanIterInit(redisArray *ar, uint64_t start, uint64_t end, + arScanIter *it) +{ + memset(it, 0, sizeof(*it)); /* Zeroing also clears done, top_done and slice_ready. */ + it->ar = ar; + + if (ar == NULL || arCount(ar) == 0) { + it->done = 1; + it->top_done = 1; + return; + } + + /* Note that a few things here could be taken + * from the array itself, as they are immutable, + * but after introducing this abstraction a small + * but measurable speed regression suggested to + * micro-optimize for this hot path and have + * iterator-side copies of often used stuff. */ + it->reverse = start > end; /* start > end selects a high-to-low scan. */ + it->step = it->reverse ? -1 : 1; + it->lo = it->reverse ? end : start; + it->hi = it->reverse ? start : end; + it->slice_size = ar->slice_size; + it->lo_slice = it->lo / it->slice_size; + it->hi_slice = it->hi / it->slice_size; + it->slot_index = AR_SCAN_ITER_SLOT_UNSET; + + /* No intersection between the range and the array span. */ + if (it->lo_slice > ar->dir_highest_used) { + it->done = 1; + it->top_done = 1; + return; + } + + /* Clip the high end to the actual array span: hi becomes the last + * offset of the highest used slice. */ + if (it->hi_slice > ar->dir_highest_used) { + it->hi_slice = ar->dir_highest_used; + it->hi = arMakeIdx(it->hi_slice, it->slice_size - 1, it->slice_size); + } + + /* Clipping made the range empty? */ + if (it->lo_slice > it->hi_slice) { + it->done = 1; + it->top_done = 1; + return; + } + + if (ar->superdir) { + int found; + + /* Start from the first superdir block that can intersect the range. + * NOTE(review): arSuperDirFind() presumably returns the insertion + * position when block_id is absent (found == 0); confirm there. */ + uint64_t block_id = (it->reverse ? it->hi_slice : it->lo_slice) / + AR_SUPER_BLOCK_SLOTS; + uint32_t pos = arSuperDirFind(ar, block_id, &found); + + if (it->reverse) it->sdir_index = found ? (int32_t)pos : (int32_t)pos - 1; + else it->sdir_index = (int32_t)pos; + + /* No superdir block intersects the clipped range. */ + if (it->sdir_index < 0 || it->sdir_index >= (int32_t)ar->sdir_len) { + it->done = 1; + it->top_done = 1; + } + } else { + /* Flat directory iteration starts directly from the first in-range slice. 
*/ + it->slice_id = it->reverse ? it->hi_slice : it->lo_slice; + } +} + +/* Prepare the current slice-local scan state. Returns 1 if the slice may + * yield at least one populated element in range, otherwise 0. + * The function is used by arScanIterLoadNextSlice() each time a new + * slice should be iterated. When a new slice is selected by + * arScanIterLoadNextSlice(), then this function is called to setup the + * iteration needed by arScanIterNext(). + * + * On a 0 return some dense/sparse cursor fields may already have been + * written, but slice_ready is only set on the success path. */ +static ALWAYS_INLINE int arScanIterPrepareSlice(arScanIter *it, + arSlice *s, uint64_t slice_id) +{ + uint64_t slice_base = slice_id * it->slice_size; + /* Restrict the scan to the part of this slice touched by the query. */ + uint32_t off_lo = (slice_id == it->lo_slice) ? + arSliceOff(it->lo, it->slice_size) : 0; + uint32_t off_hi = (slice_id == it->hi_slice) ? + arSliceOff(it->hi, it->slice_size) : it->slice_size - 1; + + if (s->encoding == AR_SLICE_DENSE) { + uint32_t win_lo = s->layout.dense.offset; + uint32_t win_hi = s->layout.dense.offset + s->layout.dense.winsize - 1; /* Inclusive window end. NOTE(review): assumes winsize > 0; confirm. */ + + /* Dense slices may only have a smaller populated window allocated. */ + if (off_lo < win_lo) off_lo = win_lo; + if (off_hi > win_hi) off_hi = win_hi; + + /* No intersection between the range and the dense window. */ + if (off_lo > off_hi) return 0; + + it->dense = 1; + it->dense_items = s->layout.dense.items; + it->dense_off = it->reverse ? (int32_t)off_hi : (int32_t)off_lo; + it->dense_item_pos = it->dense_off - (int32_t)win_lo; /* Window-relative index of the first slot to visit. */ + it->dense_item_end = (it->reverse ? (int32_t)off_lo : + (int32_t)off_hi) - (int32_t)win_lo; + } else { + int found; + uint32_t pos; + uint16_t *offsets = s->layout.sparse.offsets; + + it->dense = 0; + it->sparse_offsets = offsets; + it->sparse_values = s->layout.sparse.values; + it->sparse_count = (int32_t)s->count; + if (it->reverse) { + /* Start from the last sparse entry that can still be in range. */ + pos = arSparseFindPos(s, (uint16_t)off_hi, &found); + it->sparse_pos = found ? 
(int32_t)pos : (int32_t)pos - 1; + + /* No sparse entry falls inside the requested offsets. */ + if (it->sparse_pos < 0 || offsets[it->sparse_pos] < off_lo) + return 0; + } else { + /* Start from the first sparse entry that can still be in range. */ + pos = arSparseFindPos(s, (uint16_t)off_lo, &found); + it->sparse_pos = (int32_t)pos; + + /* No sparse entry falls inside the requested offsets. */ + if (it->sparse_pos >= (int32_t)s->count || + offsets[it->sparse_pos] > off_hi) return 0; + } + } + + it->slice = s; /* Commit the slice only now that it is known to intersect the range. */ + it->slice_base = slice_base; + it->off_lo = off_lo; + it->off_hi = off_hi; + it->slice_ready = 1; + return 1; +} + +/* Advance top-level directory state until a non-NULL slice in range is ready + * for local scanning, or return 0 if the iterator is exhausted. + * + * Two directory layouts are handled: the superdir (entries of + * AR_SUPER_BLOCK_SLOTS slots each, addressed by sdir_index/slot_index) and + * the flat directory (addressed by slice_id). */ +static ALWAYS_INLINE int arScanIterLoadNextSlice(arScanIter *it) { + redisArray *ar = it->ar; + + if (ar->superdir) { + while (!it->top_done) { + /* No more superdir blocks to inspect. */ + if (it->sdir_index < 0 || it->sdir_index >= (int32_t)ar->sdir_len) { + it->top_done = 1; + break; + } + + arSDirEntry *e = ar->superdir + it->sdir_index; + uint64_t block_base = e->block_id * AR_SUPER_BLOCK_SLOTS; /* First slice id covered by this entry. */ + uint64_t block_end = block_base + AR_SUPER_BLOCK_SLOTS - 1; /* Last slice id covered by this entry. */ + int32_t block_slot_lo = (block_base < it->lo_slice) ? + (int32_t)(it->lo_slice - block_base) : 0; + int32_t block_slot_hi = (block_end > it->hi_slice) ? + (int32_t)(it->hi_slice - block_base) : AR_SUPER_BLOCK_SLOTS - 1; + + /* This block starts after the requested range. */ + if (block_base > it->hi_slice) { + it->top_done = 1; + break; + } + + /* This block ends before the requested range. 
*/ + if (block_end < it->lo_slice) { + if (it->reverse) it->top_done = 1; + else it->sdir_index++; + it->slot_index = AR_SCAN_ITER_SLOT_UNSET; + continue; + } + + if (it->reverse) { + /* slot_index uses a sentinel outside the valid 0..2047 range + * so reverse scans can consume slot 0 and then fall below the + * block without looking like a fresh block entry. */ + if (it->slot_index == AR_SCAN_ITER_SLOT_UNSET) + it->slot_index = block_slot_hi; + + while (it->slot_index >= block_slot_lo) { + int32_t si = it->slot_index--; /* Cursor moves first, so the next call resumes after this slot. */ + arSlice *s = e->slots[si]; + if (s && arScanIterPrepareSlice(it, s, block_base + si)) + return 1; + } + + /* This block had no more matching slices, move to the previous block. */ + it->sdir_index--; + it->slot_index = AR_SCAN_ITER_SLOT_UNSET; + } else { + /* slot_index uses a sentinel outside the valid 0..2047 range + * so an exhausted block does not look like a fresh entry. */ + if (it->slot_index == AR_SCAN_ITER_SLOT_UNSET) + it->slot_index = block_slot_lo; + + while (it->slot_index <= block_slot_hi) { + int32_t si = it->slot_index++; /* Cursor moves first, so the next call resumes after this slot. */ + arSlice *s = e->slots[si]; + if (s && arScanIterPrepareSlice(it, s, block_base + si)) + return 1; + } + + /* This block had no more matching slices, move to the next block. */ + it->sdir_index++; + it->slot_index = AR_SCAN_ITER_SLOT_UNSET; + } + } + } else { + while (!it->top_done) { + uint64_t slice_id = it->slice_id; + arSlice *s = ar->dir[slice_id]; + + /* Advance the top-level cursor before possibly returning this slice. + * top_done is raised on the boundary slice instead of stepping + * past it, so slice_id never leaves [lo_slice, hi_slice]. */ + if (it->reverse) { + if (slice_id == it->lo_slice) it->top_done = 1; + else it->slice_id = slice_id - 1; + } else { + if (slice_id == it->hi_slice) it->top_done = 1; + else it->slice_id = slice_id + 1; + } + + if (s && arScanIterPrepareSlice(it, s, slice_id)) + return 1; + } + } + + return 0; +} + +/* Return the next populated element in range, or 0 when done. 
*/ +static ALWAYS_INLINE int arScanIterNext(arScanIter *it, + uint64_t *idx, void **value) +{ + /* The iterator was already fully consumed. */ + if (it->done) return 0; + + while (1) { + if (it->slice_ready) { + /* Drain the current slice before asking for another one. */ + if (it->dense) { + while ((it->step > 0 && it->dense_item_pos <= it->dense_item_end) || + (it->step < 0 && it->dense_item_pos >= it->dense_item_end)) { + uint32_t off = (uint32_t)it->dense_off; + void *v = it->dense_items[it->dense_item_pos]; + it->dense_off += it->step; /* Advance before returning so the next call resumes after this slot. */ + it->dense_item_pos += it->step; + + /* Dense windows may contain holes. */ + if (arIsEmpty(v)) continue; + + if (idx) *idx = it->slice_base + off; /* idx is optional: callers may pass NULL. */ + *value = v; /* value is written unconditionally, so it must not be NULL. */ + return 1; + } + } else { + while (it->sparse_pos >= 0 && it->sparse_pos < it->sparse_count) { + int32_t pos = it->sparse_pos; + uint32_t off = it->sparse_offsets[pos]; + + /* Sparse entries are sorted, so leaving the window ends this slice. */ + if (off < it->off_lo || off > it->off_hi) break; + + it->sparse_pos += it->step; + if (idx) *idx = it->slice_base + off; + *value = it->sparse_values[pos]; + return 1; + } + } + + /* The current slice has no more in-range populated elements. */ + it->slice = NULL; + it->slice_ready = 0; + } + + /* No more in-range slices are available. */ + if (!arScanIterLoadNextSlice(it)) { + it->done = 1; + return 0; + } + } +} + +/* ARSCAN key start end [LIMIT count] + * + * Returns only existing elements, each one as a nested [index, value] pair + * (a two-element array per element). + * + * Complexity is O(P), where P is visited positions in touched slices + * (dense scanned slots + sparse entries), with worst-case O(|end-start|+1) + * and typical case close to O(N), where N is the number of existing + * elements in range. This means that huge ranges are safe and will not + * block the server with a work bound to the span length. + * + * Unlike ARGETRANGE, holes are skipped rather than returned as NULLs. + * LIMIT caps the number of returned pairs. A missing key replies with an + * empty array. 
*/ +void arscanCommand(client *c) { + uint64_t start, end; + if (arrayParseIndexOrReply(c, c->argv[2], &start) != C_OK) return; + if (arrayParseIndexOrReply(c, c->argv[3], &end) != C_OK) return; + + /* Parse optional LIMIT: without it, 'remaining' stays at UINT64_MAX, + * which is effectively unlimited. */ + uint64_t remaining = UINT64_MAX; + if (c->argc == 6) { + if (strcasecmp(c->argv[4]->ptr, "LIMIT") != 0) { + addReplyErrorObject(c, shared.syntaxerr); + return; + } + long long ll; + if (getLongLongFromObjectOrReply(c, c->argv[5], &ll, NULL) != C_OK) + return; + if (ll <= 0) { + addReplyError(c, "LIMIT must be positive"); + return; + } + remaining = (uint64_t)ll; + } else if (c->argc != 4) { + addReplyErrorArity(c); + return; + } + + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o != NULL && checkType(c, o, OBJ_ARRAY)) return; + + if (o == NULL) { + addReplyArrayLen(c, 0); + return; + } + + redisArray *ar = o->ptr; + void *replylen = addReplyDeferredLen(c); /* The pair count is only known after the scan. */ + uint64_t count = 0; + arScanIter it; + uint64_t idx; + void *v; + + arScanIterInit(ar, start, end, &it); + while (remaining && arScanIterNext(&it, &idx, &v)) { + /* Reply with nested [idx, value] pairs. */ + addReplyArrayLen(c, 2); + addReplyUnsignedLongLong(c, idx); + addReplyArrayValue(c, v); + count++; + remaining--; + } + + setDeferredArrayLen(c, replylen, count); +} + +/* ============================================================================ + * ARGREP + * ============================================================================ + * + * Search existing array elements in a range using textual predicates. + * Like ARSCAN, the work is bound by the visited slices, not by the raw + * numeric span alone: dense slices scan the touched dense window, while + * sparse slices only scan stored entries inside the covered offsets. 
+ * -------------------------------------------------------------------------- */ + +#define ARGREP_PRED_EXACT 1 /* Whole value equals the pattern. */ +#define ARGREP_PRED_MATCH 2 /* Pattern occurs as a substring of the value. */ +#define ARGREP_PRED_GLOB 3 /* Glob pattern, evaluated with stringmatchlen(). */ +#define ARGREP_PRED_RE 4 /* Regular expression, compiled once per command. */ + +#define ARGREP_MAX_PREDICATES 250 /* Maximum predicates accepted in one command. */ +#define ARGREP_MAX_RE_LEN 2048 /* Maximum RE pattern length in bytes. */ + +#define ARGREP_COMBINE_OR 1 +#define ARGREP_COMBINE_AND 2 + +#define ARGREP_BOUND_INDEX 1 +#define ARGREP_BOUND_START 2 +#define ARGREP_BOUND_END 3 + +typedef struct { + int type; /* EXACT, MATCH, GLOB, or RE. */ + sds pattern; /* Pattern argument exactly as given by the user (points into c->argv, not owned). */ + regex_t regex; /* Compiled regex for RE predicates. */ + int regex_compiled; /* Whether regex must be freed. */ +} arGrepPredicate; + +typedef struct { + int type; /* Numeric index, logical start, or logical end. */ + uint64_t index; /* Used only for numeric bounds. */ +} arGrepBound; + +typedef struct { + arGrepPredicate *preds; /* All predicates to apply to each element. */ + int num_preds; /* Number of predicates stored in preds[]. */ + int combine; /* OR by default, AND if requested. */ + int withvalues; /* Reply with nested [idx, value] pairs instead of bare indexes. */ + int nocase; /* Apply case-insensitive matching globally. */ +} arGrepPlan; + +/* Lowercase only ASCII letters. This keeps MATCH/EXACT deterministic and + * locale-independent even on arbitrary binary payloads. */ +static inline unsigned char arGrepLowerAscii(unsigned char c) { + return (c >= 'A' && c <= 'Z') ? (unsigned char)(c + ('a' - 'A')) : c; +} + +/* Compare two byte strings, optionally ignoring ASCII case. + * Returns 1 when they are equal, 0 otherwise. Lengths are explicit, so + * embedded NUL bytes are compared like any other byte. */ +int arGrepBytesEqual(const char *a, size_t alen, const char *b, size_t blen, + int nocase) { + if (alen != blen) return 0; + if (!nocase) return memcmp(a, b, alen) == 0; + + for (size_t i = 0; i < alen; i++) { + if (arGrepLowerAscii((unsigned char)a[i]) != + arGrepLowerAscii((unsigned char)b[i])) { + return 0; + } + } + return 1; +} + +/* Find a needle inside a byte string, optionally ignoring ASCII case. 
*/ +int arGrepBytesContains(const char *haystack, size_t haystack_len, + const char *needle, size_t needle_len, int nocase) { + if (needle_len == 0) return 1; /* An empty needle is found in any haystack. */ + if (needle_len > haystack_len) return 0; + + size_t last = haystack_len - needle_len; /* Last start position where the needle still fits. */ + for (size_t i = 0; i <= last; i++) { + if (arGrepBytesEqual(haystack + i, needle_len, needle, needle_len, + nocase)) { + return 1; + } + } + return 0; +} + +/* Return the predicate type for a keyword, or 0 if it is not one. + * Keywords are matched case-insensitively. */ +int arGrepPredicateType(const char *token) { + if (!strcasecmp(token, "EXACT")) return ARGREP_PRED_EXACT; + if (!strcasecmp(token, "MATCH")) return ARGREP_PRED_MATCH; + if (!strcasecmp(token, "GLOB")) return ARGREP_PRED_GLOB; + if (!strcasecmp(token, "RE")) return ARGREP_PRED_RE; + return 0; +} + +/* Free any compiled regex state created while parsing ARGREP, and the + * predicates array itself. Pattern strings are not freed here. */ +void arGrepFreePlan(arGrepPlan *plan) { + if (plan->preds == NULL) return; + + for (int i = 0; i < plan->num_preds; i++) { + if (plan->preds[i].regex_compiled) + tre_regfree(&plan->preds[i].regex); + } + zfree(plan->preds); + plan->preds = NULL; /* Makes a second call on the same plan a no-op. */ +} + +/* Parse a bound argument. ARGREP accepts the special tokens "-" and "+" + * in addition to normal array indexes. Returns C_OK with 'bound' filled, + * or C_ERR after replying with an error. */ +int arGrepParseBoundOrReply(client *c, robj *arg, arGrepBound *bound) { + if (arg->encoding != OBJ_ENCODING_INT) { /* Only non-integer encodings are inspected as text. */ + sds token = arg->ptr; + if (sdslen(token) == 1 && token[0] == '-') { + bound->type = ARGREP_BOUND_START; + bound->index = 0; + return C_OK; + } + if (sdslen(token) == 1 && token[0] == '+') { + bound->type = ARGREP_BOUND_END; + bound->index = 0; + return C_OK; + } + } + + if (getArrayIndexFromObject(arg, &bound->index, 0) != C_OK) { + addReplyError(c, "invalid array index"); + return C_ERR; + } + bound->type = ARGREP_BOUND_INDEX; + return C_OK; +} + +/* Resolve a parsed bound against the current array length. 
*/ +uint64_t arGrepResolveBound(arGrepBound *bound, uint64_t max_index) { + if (bound->type == ARGREP_BOUND_START) return 0; + if (bound->type == ARGREP_BOUND_END) return max_index; + return bound->index; /* Numeric bounds are returned as given, without clamping to max_index. */ +} + +/* Compile all RE predicates after the whole command is parsed, so NOCASE is + * already known and affects every regex consistently. + * + * Returns C_OK, or C_ERR after replying with an error. On C_ERR, regexes + * compiled so far stay flagged with regex_compiled, so arGrepFreePlan() + * releases them. */ +int arGrepCompileRegexesOrReply(client *c, arGrepPlan *plan) { + for (int i = 0; i < plan->num_preds; i++) { + arGrepPredicate *pred = &plan->preds[i]; + if (pred->type != ARGREP_PRED_RE) continue; + + if (sdslen(pred->pattern) == 0) { + addReplyError(c, "regular expression is empty"); + return C_ERR; + } + + int cflags = REG_EXTENDED | REG_NOSUB | REG_USEBYTES; + if (plan->nocase) cflags |= REG_ICASE; + + int err = tre_regncompb(&pred->regex, pred->pattern, + sdslen(pred->pattern), cflags); + if (err != REG_OK) { + char errbuf[256]; + tre_regerror(err, &pred->regex, errbuf, sizeof(errbuf)); + addReplyErrorFormat(c, "invalid regular expression: %s", errbuf); + return C_ERR; + } + pred->regex_compiled = 1; /* Set before the backreference check so the error path below still frees this regex. */ + + if (tre_have_backrefs(&pred->regex)) { + addReplyError(c, "regular expression backreferences are not supported"); + return C_ERR; + } + } + return C_OK; +} + +/* Parse predicates and global modifiers in a single pass. This makes the + * command more user-friendly because predicates and options can be mixed + * freely. If the same global option appears multiple times, the last one + * wins. 
*/ +int arGrepParsePlanOrReply(client *c, arGrepPlan *plan, uint64_t *limit) { + memset(plan, 0, sizeof(*plan)); + plan->combine = ARGREP_COMBINE_OR; + *limit = UINT64_MAX; + + int max_preds = c->argc - 4; /* Upper bound: each predicate consumes two arguments starting at argv[4]. */ + plan->preds = zcalloc(sizeof(*plan->preds) * max_preds); /* Released by arGrepFreePlan(); error returns below do not free it. */ + + for (int arg = 4; arg < c->argc; ) { + sds token = c->argv[arg]->ptr; + int type = arGrepPredicateType(token); + + if (type != 0) { + if (arg + 1 >= c->argc) { + addReplyErrorObject(c, shared.syntaxerr); + return C_ERR; + } + if (plan->num_preds >= ARGREP_MAX_PREDICATES) { + addReplyErrorFormat(c, "too many predicates, maximum is %d", + ARGREP_MAX_PREDICATES); + return C_ERR; + } + + arGrepPredicate *pred = &plan->preds[plan->num_preds++]; + pred->type = type; + pred->pattern = c->argv[arg + 1]->ptr; /* Pointer into argv: the string is not duplicated. */ + if (type == ARGREP_PRED_RE && + sdslen(pred->pattern) > ARGREP_MAX_RE_LEN) { + addReplyErrorFormat(c, + "regular expression is too long, maximum is %d bytes", + ARGREP_MAX_RE_LEN); + return C_ERR; + } + arg += 2; + continue; + } + + if (!strcasecmp(token, "LIMIT")) { + if (arg + 1 >= c->argc) { + addReplyErrorObject(c, shared.syntaxerr); + return C_ERR; + } + + long long ll; + if (getLongLongFromObjectOrReply(c, c->argv[arg + 1], &ll, NULL) + != C_OK) { + return C_ERR; + } + if (ll <= 0) { + addReplyError(c, "LIMIT must be positive"); + return C_ERR; + } + + *limit = (uint64_t)ll; + arg += 2; + continue; + } + + if (!strcasecmp(token, "WITHVALUES")) { + plan->withvalues = 1; + arg++; + continue; + } + + if (!strcasecmp(token, "NOCASE")) { + plan->nocase = 1; + arg++; + continue; + } + + if (!strcasecmp(token, "AND") || !strcasecmp(token, "OR")) { + plan->combine = !strcasecmp(token, "AND") ? 
+ ARGREP_COMBINE_AND : ARGREP_COMBINE_OR; + arg++; + continue; + } + + /* Anything that is not a predicate keyword or a known option. */ + addReplyErrorObject(c, shared.syntaxerr); + return C_ERR; + } + + /* At least one predicate is mandatory. */ + if (plan->num_preds == 0) { + addReplyErrorObject(c, shared.syntaxerr); + return C_ERR; + } + + return arGrepCompileRegexesOrReply(c, plan); +} + +/* Match one predicate against the decoded element bytes. + * Returns non-zero on match. */ +int arGrepMatchPredicate(arGrepPredicate *pred, const char *data, size_t len, + int nocase) { + size_t pattern_len = sdslen(pred->pattern); + + switch (pred->type) { + case ARGREP_PRED_EXACT: + return arGrepBytesEqual(data, len, pred->pattern, pattern_len, nocase); + case ARGREP_PRED_MATCH: + return arGrepBytesContains(data, len, pred->pattern, pattern_len, + nocase); + case ARGREP_PRED_GLOB: + return stringmatchlen(pred->pattern, pattern_len, data, len, nocase); + case ARGREP_PRED_RE: + /* 'nocase' is not used here: REG_ICASE was applied when the regex + * was compiled. */ + return tre_regnexecb(&pred->regex, data, len, 0, NULL, 0) == REG_OK; + default: + serverPanic("Unknown ARGREP predicate type"); + } +} + +/* Decode one array value and apply all the predicates to it. + * With AND every predicate must match; otherwise (OR) one match is enough. + * Both loops stop at the first predicate that decides the outcome. */ +int arGrepValueMatches(arGrepPlan *plan, void *v) { + char buf[AR_INLINE_BUFSIZE]; + size_t len; + const char *data = arDecode(v, buf, sizeof(buf), &len); + + if (plan->combine == ARGREP_COMBINE_AND) { + for (int i = 0; i < plan->num_preds; i++) { + if (!arGrepMatchPredicate(&plan->preds[i], data, len, + plan->nocase)) { + return 0; + } + } + return 1; + } + + for (int i = 0; i < plan->num_preds; i++) { + if (arGrepMatchPredicate(&plan->preds[i], data, len, plan->nocase)) + return 1; + } + return 0; +} + +/* ARGREP key start end + * (EXACT string | MATCH string | GLOB pattern | RE pattern) ... + * [AND | OR] [LIMIT count] [WITHVALUES] [NOCASE] + * + * Search existing elements in a range and return matching indexes. + * + * Complexity is O(P * C), where P is the number of visited positions in the + * touched slices and C is the cost of evaluating the active predicates. 
+ * Dense slices scan the touched dense window, sparse slices only visit stored + * entries, and LIMIT stops as soon as enough matches were emitted. + * + * "-" and "+" mean the logical start and end of the array. WITHVALUES changes + * the reply from [idx ...] to nested pairs [[idx, value] ...]. + * + * The plan is released with arGrepFreePlan() on every exit path. */ +void argrepCommand(client *c) { + arGrepBound start_bound, end_bound; + if (arGrepParseBoundOrReply(c, c->argv[2], &start_bound) != C_OK) return; + if (arGrepParseBoundOrReply(c, c->argv[3], &end_bound) != C_OK) return; + + arGrepPlan plan; + uint64_t remaining; + if (arGrepParsePlanOrReply(c, &plan, &remaining) != C_OK) { + arGrepFreePlan(&plan); + return; + } + + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o != NULL && checkType(c, o, OBJ_ARRAY)) { + arGrepFreePlan(&plan); + return; + } + if (o == NULL) { + arGrepFreePlan(&plan); + addReplyArrayLen(c, 0); + return; + } + + redisArray *ar = o->ptr; + uint64_t ar_len = arLen(ar); + if (ar_len == 0 || arCount(ar) == 0) { /* Also keeps ar_len - 1 below from wrapping. */ + arGrepFreePlan(&plan); + addReplyArrayLen(c, 0); + return; + } + + void *replylen = addReplyDeferredLen(c); + uint64_t count = 0; + uint64_t max_index = ar_len - 1; + uint64_t start = arGrepResolveBound(&start_bound, max_index); + uint64_t end = arGrepResolveBound(&end_bound, max_index); + arScanIter it; + uint64_t idx; + void *v; + + arScanIterInit(ar, start, end, &it); + while (remaining && arScanIterNext(&it, &idx, &v)) { + if (!arGrepValueMatches(&plan, v)) continue; /* Non-matching elements do not consume LIMIT. */ + /* With WITHVALUES, reply nested [idx, value] pairs. */ + if (plan.withvalues) addReplyArrayLen(c, 2); + addReplyUnsignedLongLong(c, idx); + if (plan.withvalues) addReplyArrayValue(c, v); + count++; + remaining--; + } + + setDeferredArrayLen(c, replylen, count); + arGrepFreePlan(&plan); +} + +/* ============================================================================ + * AROP + * ============================================================================ + * + * Aggregate operations over a range. 
Uses O(N) iteration where N is the + * number of stored elements. Dense slices scan the window intersection + * (bounded by dense.winsize, kept small by demotion when density drops). + * -------------------------------------------------------------------------- */ + +/* Operation types for AROP */ +#define AROP_SUM 1 /* Sum of numeric elements in range. */ +#define AROP_MIN 2 /* Minimum numeric element in range. */ +#define AROP_MAX 3 /* Maximum numeric element in range. */ +#define AROP_AND 4 /* Bitwise AND of integer (or truncated float) elements in range. */ +#define AROP_OR 5 /* Bitwise OR of integer (or truncated float) elements in range. */ +#define AROP_XOR 6 /* Bitwise XOR of integer (or truncated float) elements in range. */ +#define AROP_MATCH 7 /* Count elements equal to a target string. */ +#define AROP_USED 8 /* Count of non-empty (used) slots in range. */ + +/* Accumulator state for AROP. Only the fields that belong to the selected + * 'op' are updated while scanning. */ +typedef struct { + int op; /* Selected AROP operation. */ + sds match_val; /* MATCH target string (NULL for the other operations). */ + long double sum_acc; /* Running SUM accumulator. */ + long double minmax_acc; /* Running MIN or MAX accumulator. */ + int64_t bitwise_acc; /* Running AND/OR/XOR accumulator. */ + long long match_count; /* Number of MATCH hits. */ + long long used_count; /* Number of non-empty elements seen. */ + int has_numeric; /* Saw at least one numeric value. */ + int has_int; /* Saw at least one bitwise-usable integer. */ +} arOpAcc; + +/* Process a single value for AROP aggregation, aggregating it + * into the structure arOpAcc 'acc'. This helper is used + * directly by the AROP command implementation while scanning + * populated elements in the requested range. 
*/ +static inline void arOpAccumulate(arOpAcc *acc, void *v) { + if (acc->op == AROP_USED) { + acc->used_count++; + return; + } + + if (acc->op == AROP_MATCH) { + size_t vlen; + char vbuf[AR_INLINE_BUFSIZE]; + const char *data = arDecode(v, vbuf, sizeof(vbuf), &vlen); + if (vlen == sdslen(acc->match_val) && + memcmp(data, acc->match_val, vlen) == 0) { + acc->match_count++; + } + return; + } + + /* Numeric operations */ + long double num; + int is_int = 0; + int64_t ival = 0; + + if (arIsInt(v)) { + ival = arToInt(v); + num = (long double)ival; + is_int = 1; + } else if (arIsFloat(v)) { + num = (long double)arToDouble(v); + } else { + const char *data; + size_t vlen; + char smallbuf[8]; + + if (arIsSmallStr(v)) { + vlen = arToSmallStr(v, smallbuf); + data = smallbuf; + } else { + data = arStringData(v); + vlen = arStringLen(v); + } + + long long ll; + if (string2ll(data, vlen, &ll)) { + ival = ll; + num = (long double)ll; + is_int = 1; + } else { + long double ld; + if (string2ld(data, vlen, &ld)) { + num = ld; + } else { + return; + } + } + } + + if (acc->op == AROP_AND || acc->op == AROP_OR || acc->op == AROP_XOR) { + if (!is_int) { + /* If it is a float, we only take the integer part. */ + if (isnan(num)) return; + if (num < (long double)INT64_MIN || num > (long double)INT64_MAX) + return; + ival = (int64_t)num; /* Truncate toward zero. */ + } + if (!acc->has_int) { + acc->bitwise_acc = ival; + acc->has_int = 1; + } else { + if (acc->op == AROP_AND) acc->bitwise_acc &= ival; + else if (acc->op == AROP_OR) acc->bitwise_acc |= ival; + else acc->bitwise_acc ^= ival; + } + } else { + if (!acc->has_numeric) { + /* Handle the first element seen for SUM, MIN, MAX. 
*/ + acc->sum_acc = num; + acc->minmax_acc = num; + acc->has_numeric = 1; + } else { + if (acc->op == AROP_SUM) + acc->sum_acc += num; + else if (acc->op == AROP_MIN && num < acc->minmax_acc) + acc->minmax_acc = num; + else if (acc->op == AROP_MAX && num > acc->minmax_acc) + acc->minmax_acc = num; + } + } +} + +/* AROP key start end OP [arg] + * + * Aggregates over existing elements in the requested range, the + * aggregation performed depends in the "op" argument. + * + * Complexity is O(P), where P is visited positions in touched slices + * (dense scanned slots + sparse entries), with worst-case O(|end-start|+1) + * and typical case close to O(N), where N is the number of existing + * elements in range. + * + * MATCH and USED count hits. SUM/MIN/MAX ignore values that are not numeric. + * AND/OR/XOR truncate floats toward zero and ignore values that, after the + * truncation, cannot be represented as int64_t. */ +void aropCommand(client *c) { + uint64_t start, end; + if (arrayParseIndexOrReply(c, c->argv[2], &start) != C_OK) return; + if (arrayParseIndexOrReply(c, c->argv[3], &end) != C_OK) return; + + const char *opstr = c->argv[4]->ptr; + int op = 0; + if (!strcasecmp(opstr, "SUM")) op = AROP_SUM; + else if (!strcasecmp(opstr, "MIN")) op = AROP_MIN; + else if (!strcasecmp(opstr, "MAX")) op = AROP_MAX; + else if (!strcasecmp(opstr, "AND")) op = AROP_AND; + else if (!strcasecmp(opstr, "OR")) op = AROP_OR; + else if (!strcasecmp(opstr, "XOR")) op = AROP_XOR; + else if (!strcasecmp(opstr, "MATCH")) op = AROP_MATCH; + else if (!strcasecmp(opstr, "USED")) op = AROP_USED; + else { + addReplyError(c, "unknown operation"); + return; + } + + sds match_val = NULL; + if (op == AROP_MATCH) { + if (c->argc != 6) { + addReplyError(c, "MATCH requires a value argument"); + return; + } + match_val = c->argv[5]->ptr; + } else if (c->argc != 5) { + addReplyErrorArity(c); + return; + } + + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o == NULL) { + if (op == AROP_MATCH || op 
== AROP_USED) { + addReplyLongLong(c, 0); + } else { + addReplyNull(c); + } + return; + } + if (checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + arOpAcc acc = { + .op = op, .match_val = match_val, + .sum_acc = 0, .minmax_acc = 0, .bitwise_acc = 0, + .match_count = 0, .used_count = 0, + .has_numeric = 0, .has_int = 0 + }; + arScanIter it; + void *v; + + /* All current AROP operations are order-independent, so iterating the + * user-provided direction is fine here. */ + arScanIterInit(ar, start, end, &it); + while (arScanIterNext(&it, NULL, &v)) + arOpAccumulate(&acc, v); + + /* Reply */ + if (op == AROP_MATCH) { + addReplyLongLong(c, acc.match_count); + } else if (op == AROP_USED) { + addReplyLongLong(c, acc.used_count); + } else if (op == AROP_AND || op == AROP_OR || op == AROP_XOR) { + if (!acc.has_int) addReplyNull(c); + else addReplyLongLong(c, acc.bitwise_acc); + } else { + if (!acc.has_numeric) { + addReplyNull(c); + } else { + long double result = (op == AROP_SUM) ? acc.sum_acc : acc.minmax_acc; + char buf[MAX_LONG_DOUBLE_CHARS + 1]; + int len = ld2string(buf, sizeof(buf), result, LD_STR_AUTO); + addReplyBulkCBuffer(c, buf, len); + } + } +} + +/* ---------------------------------------------------------------------------- + * The ring buffer family of commands: + * + * ARINSERT / ARNEXT / ARSEEK / ARLASTITEMS + * -------------------------------------------------------------------------- */ + +/* ARINSERT key value [value ...] + * + * Appends one or more values at the private insert cursor in O(N), where N is + * the number of values. The whole batch fails on index overflow. + * + * The cursor is then advanced to the last written index, which is also + * returned as the command return value, and can be inspected later + * with ARNEXT. 
*/ +void arinsertCommand(client *c) { + robj *o = lookupArrayForWriteOrReply(c, c->argv[1]); + if (o == NULL) return; + + redisArray *ar = o->ptr; + uint64_t old_count = arCount(ar); /* Snapshot for the key-sizes histogram update below. */ + size_t old_alloc = 0; + if (server.memory_tracking_enabled) old_alloc = kvobjAllocSize(o); + int num_values = c->argc - 2; + + /* Pre-validate: compute start cursor and check entire batch fits */ + uint64_t start_cursor; + if (ar->insert_idx == AR_INSERT_IDX_NONE) { + start_cursor = 0; /* No cursor yet: the first insert goes to index 0. */ + } else { + if (ar->insert_idx >= UINT64_MAX - 1) { + addReplyError(c, "insert index overflow"); + return; + } + start_cursor = ar->insert_idx + 1; + } + + /* Check last cursor won't overflow or reach forbidden index. + * NOTE(review): UINT64_MAX is presumably reserved (AR_INSERT_IDX_NONE); + * confirm against its definition. */ + uint64_t last_cursor = start_cursor + (uint64_t)num_values - 1; + if (last_cursor < start_cursor || last_cursor == UINT64_MAX) { + addReplyError(c, "insert index overflow"); + return; + } + + /* Pre-promote sparse slices only for true bulk inserts. A single-element + * insert does not benefit from the extra range-analysis pass. */ + if (num_values > 1) + arMayPromoteToDenseForRangeSet(ar, start_cursor, last_cursor); + + /* Apply all values. Nothing was written before this point, so the + * validation errors above leave the array unchanged. */ + uint64_t cursor = start_cursor; + for (int i = 0; i < num_values; i++) { + sds val = c->argv[2 + i]->ptr; + void *v = arEncode(val, sdslen(val)); + arSet(ar, cursor, v); + cursor++; + } + ar->insert_idx = last_cursor; + + /* Accounting, key modification signalling and keyspace notification. */ + updateKeysizesHist(c->db, OBJ_ARRAY, old_count, arCount(ar)); + if (server.memory_tracking_enabled) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc, kvobjAllocSize(o)); + keyModified(c, c->db, c->argv[1], o, 1); + notifyKeyspaceEvent(NOTIFY_ARRAY, "arinsert", c->argv[1], c->db->id); + server.dirty += num_values; + + addReplyUnsignedLongLong(c, ar->insert_idx); +} + +/* Duplicate one array value exactly. Immediate values can be copied as tagged + * words, while heap strings are re-encoded from their logical string form. 
+ * This could be regarded as costly, but capturing values out of the existing + * array would break the sparsearray API isolation. */ +static void *arRingDupValue(void *v) { + if (v == NULL || !arIsPtr(v)) return v; /* NULL and non-pointer (immediate) values are returned as they are. */ + return arEncode(arStringData(v), arStringLen(v)); +} + +/* Return the next slot that ARRING would write to before modulo reduction. + * With no cursor set (AR_INSERT_IDX_NONE) the next slot is 0. */ +static uint64_t arRingNextCursor(redisArray *ar) { + return (ar->insert_idx == AR_INSERT_IDX_NONE) ? 0 : ar->insert_idx + 1; +} + +/* Decide if ARRING needs to rebuild the retained logical ring positions before + * writing new values. + * + * We rebuild in only two cases: + * + * 1. Shrink: new size is smaller than the current inferred ring span. + * 2. Grow after wrap: the ring had already wrapped inside the old span, so + * without a rebuild the next write would overwrite old low indexes instead + * of using the newly added capacity. + * + * An explicit ARSEEK 0 is treated differently on grow: it is a direct cursor + * override saying "write next at index 0", so we honor it instead of forcing + * a grow-after-wrap repack first. + * + * keep_span is the maximum number of logical positions that may be retained. + * + * Returns 1 if a rebuild is needed (with *keep_span set to a non-zero value), + * 0 otherwise (with *keep_span set to 0). *old_span is always set to + * arLen(ar). */ +static int arRingNeedsRework(redisArray *ar, uint64_t ring_size, + uint64_t *old_span, uint64_t *keep_span) { + *old_span = arLen(ar); + *keep_span = 0; + + if (*old_span == 0) return 0; /* Empty array: nothing to retain. */ + + if (ring_size < *old_span) { /* Case 1: shrink. */ + *keep_span = ring_size; + return 1; + } + if (ring_size == *old_span) { /* Same size: the ring shape is unchanged. */ + return 0; + } + /* From here on ring_size > *old_span, so this is a grow. */ + if (ar->insert_idx == AR_INSERT_IDX_NONE) { + return 0; + } + if (arRingNextCursor(ar) < *old_span) { /* Case 2: the next write would land inside the old span. */ + *keep_span = *old_span; + return 1; + } + return 0; +} + +/* Rebuild the retained logical ring positions into a fresh compact array. + * + * We walk backward from the current anchor and keep at most keep_span items, + * but stop as soon as the first NULL is encountered. This makes resize keep + * the latest contiguous tail of existing items instead of crossing holes. 
+ * + * The retained items are replayed in chronological order, oldest to newest, + * so after the rebuild: + * + * - index 0 holds the oldest retained position + * - index retained_count-1 holds the newest retained position + * - insert_idx points to retained_count-1, ready for the next ARRING write + * + * We use two passes: one backward pass to count the contiguous retained tail, + * then one forward replay pass into the new array. This avoids any temporary + * retained-items buffer. */ +static redisArray *arRingRework(redisArray *ar, uint64_t old_span, + uint64_t keep_span) { + serverAssert(old_span > 0); + serverAssert(keep_span > 0); + serverAssert(keep_span <= old_span); + + redisArray *new_ar = arNew(); + + /* The rebuild operates on the inferred ring window [0..old_span-1]. If + * insert_idx is outside that window because of ARSEEK, fold it back into + * the current inferred span with modulo. If ARSEEK 0 was used and we are + * shrinking, anchor the walk at the current tail, just like ARLASTITEMS. + * Grow does not reach this path because arRingNeedsRework() skips grow + * rework when insert_idx is AR_INSERT_IDX_NONE. */ + uint64_t anchor_idx = (ar->insert_idx == AR_INSERT_IDX_NONE) ? + (old_span - 1) : (ar->insert_idx % old_span); + + uint64_t retained_count = 0; + uint64_t src_idx = anchor_idx; + + while (retained_count < keep_span) { + void *v = arGet(ar, src_idx); + if (v == NULL) break; /* This makes any mix of ARSET/SEEK/RING calls + * always bound to populatede items, not logical + * array span. */ + + retained_count++; + src_idx = (src_idx == 0) ? old_span - 1 : src_idx - 1; + } + + /* src_idx now points to the position just before the oldest retained + * item, so advance once to start replaying oldest -> newest. 
*/ + src_idx++; + if (src_idx == old_span) src_idx = 0; + + for (uint64_t dst_idx = 0; dst_idx < retained_count; dst_idx++) { + void *v = arGet(ar, src_idx); + serverAssert(v != NULL); + arSet(new_ar, dst_idx, arRingDupValue(v)); + + src_idx++; + if (src_idx == old_span) src_idx = 0; + } + if (retained_count != 0) new_ar->insert_idx = retained_count - 1; + return new_ar; +} + +/* ARRING key size value [value ...] + * + * Writes values into a logical ring buffer. May rework the array if + * the logical size changes across calls, so that the up to size + * items are retained in the correct logical position. + * + * Complexity is O(M) normally, where M is the number of inserted values, + * and O(N+M) on resize, where N is the maximum of the old and new ring size. + * The rebuild stops at the first NULL, so holes cut the retained tail. + * + * ARSEEK 0 is still honored as a direct cursor override on grow. + * + * Returns the last written slot. */ +void arringCommand(client *c) { + long long ll; + if (getLongLongFromObjectOrReply(c,c->argv[2],&ll,"invalid size") != C_OK) + return; + if (ll <= 0) { + addReplyError(c, "size must be positive"); + return; + } + uint64_t ring_size = (uint64_t)ll; + + robj *o = lookupArrayForWriteOrReply(c, c->argv[1]); + if (o == NULL) return; + + redisArray *ar = o->ptr; + uint64_t old_count = arCount(ar); + size_t old_alloc = 0; + if (server.memory_tracking_enabled) old_alloc = kvobjAllocSize(o); + int num_values = c->argc - 3; + uint64_t cursor = 0; + + /* If the requested size changes the logical ring shape, rebuild once + * before the hot insertion loop. This makes the command, when the user + * updates the window, no longer O(M), but O(N+M), however note that this + * is absolutely needed for high level sane semantics. Users will resize + * ring buffers, and they want to retain the latest items in a logically + * correct way. 
*/ + uint64_t old_span, keep_span; + if (arRingNeedsRework(ar, ring_size, &old_span, &keep_span)) { + redisArray *new_ar = arRingRework(ar, old_span, keep_span); + arFree(ar); + o->ptr = ar = new_ar; + } + + /* Set the new items, modulo ring size. */ + for (int i = 0; i < num_values; i++) { + /* Compute the next write position, then wrap it into the requested + * ring size if needed. By this point any needed resize/rework was + * already handled above. */ + cursor = arRingNextCursor(ar); + if (cursor >= ring_size) cursor = cursor % ring_size; + + /* Set the value */ + sds val = c->argv[3 + i]->ptr; + void *v = arEncode(val, sdslen(val)); + arSet(ar, cursor, v); + ar->insert_idx = cursor; + } + + updateKeysizesHist(c->db, OBJ_ARRAY, old_count, arCount(ar)); + if (server.memory_tracking_enabled) + updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), o, old_alloc, kvobjAllocSize(o)); + keyModified(c, c->db, c->argv[1], o, 1); + notifyKeyspaceEvent(NOTIFY_ARRAY, "arring", c->argv[1], c->db->id); + server.dirty += num_values; + + addReplyUnsignedLongLong(c, cursor); +} + +/* ARNEXT key + * + * Returns in O(1) the next index that ARINSERT / ARRING would use. + * + * Missing keys and the pre-insert state reply with 0. If the cursor is in the + * terminal state where the next append would overflow, the reply is NULL. */ +void arnextCommand(client *c) { + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o == NULL) { + addReplyLongLong(c, 0); + return; + } + if (checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + if (ar->insert_idx == AR_INSERT_IDX_NONE) { + addReplyLongLong(c, 0); + } else if (ar->insert_idx == UINT64_MAX - 1) { + addReplyNull(c); /* Terminal: index space exhausted */ + } else { + addReplyUnsignedLongLong(c, ar->insert_idx + 1); + } +} + +/* ARSEEK key idx + * + * Sets in O(1) the next index used by ARINSERT and ARRING. + * + * Returns 1 if the cursor was updated and 0 if the key does not exist. 
+ * idx 0 resets the insert state to "next write goes to 0": in this case + * successive ARRING calls are guaranteed to don't rework the array in chase + * of logical size change. */ +void arseekCommand(client *c) { + uint64_t idx; + /* Allow UINT64_MAX because ARSEEK UINT64_MAX sets insert_idx to + * UINT64_MAX-1, which is a valid terminal state (next ARINSERT + * would overflow and fail). This is needed for AOF persistence. */ + if (getArrayIndexFromObject(c->argv[2], &idx, 1) != C_OK) { + addReplyError(c, "invalid array index"); + return; + } + + /* There aren't many good options for non existing keys: both creating + * an empty array or failing with "no such key" does not align very + * well with the Redis commands usual semantics. However we need to signal + * back that we ignored the index set if the key is not there, so zero + * is returned. */ + robj *o = lookupKeyWrite(c->db, c->argv[1]); + if (o == NULL) { + addReplyLongLong(c, 0); + return; + } + if (checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + + /* Set insert_idx so next ARINSERT writes to idx */ + if (idx == 0) { + ar->insert_idx = AR_INSERT_IDX_NONE; + } else { + ar->insert_idx = idx - 1; + } + + keyModified(c, c->db, c->argv[1], o, 1); + notifyKeyspaceEvent(NOTIFY_ARRAY, "arseek", c->argv[1], c->db->id); + server.dirty++; + addReplyLongLong(c, 1); +} + +/* ARLASTITEMS key count [REV] + * + * Returns the most recent positions from the current insert anchor in O(N), + * where N is the requested count. REV flips the reply order. + * + * This command may return NULLs because it walks positions, not only existing + * items. If ARSEEK 0 was used, the current array tail is used as the anchor. */ +void arlastitemsCommand(client *c) { + long long count; + if (getLongLongFromObjectOrReply(c, c->argv[2], &count, + "invalid COUNT") != C_OK) return; + + /* For count <= 0, nothing to return, just an empty array. 
*/ + if (count <= 0) { + addReplyArrayLen(c, 0); + return; + } + + /* Parse REV if provided. */ + int rev = 0; + if (c->argc == 4) { + if (strcasecmp(c->argv[3]->ptr, "REV") == 0) { + rev = 1; + } else { + addReplyErrorObject(c, shared.syntaxerr); + return; + } + } else if (c->argc != 3) { + addReplyErrorArity(c); + return; + } + + /* No key? Empty reply. */ + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o == NULL) { + addReplyArrayLen(c, 0); + return; + } + if (checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + uint64_t ar_len = arLen(ar); + uint64_t effective_count = + (uint64_t)count > ar->count ? ar->count : (uint64_t)count; + + /* Should never happen in practice, because we checked the COUNT before + * and the array should not be empty to be still a Redis key, so this + * is mostly a safety net. */ + if (effective_count == 0) { + addReplyArrayLen(c, 0); + return; + } + + /* Collect items walking backward from insert_idx. If ARSEEK 0 was used, + * insert_idx is AR_INSERT_IDX_NONE: in that case use the max set index as + * the anchor so ARLASTITEMS still reports the tail of the current array. + * + * Note that we use an array to collect the items: in the no-REV case + * otherwise a double scan would be needed. */ + void **collected = zmalloc(effective_count * sizeof(void *)); + uint64_t anchor_idx = + (ar->insert_idx == AR_INSERT_IDX_NONE) ? ar_len - 1 : ar->insert_idx; + uint64_t current_idx = anchor_idx; + uint64_t steps = 0; + + while(steps < effective_count) { + collected[steps] = arGet(ar, current_idx); + steps++; + + /* Decrement with wrap */ + if (current_idx == 0) { + current_idx = ar_len - 1; + } else { + current_idx--; + } + } + + /* Emit the protocol with the collected items. 
*/ + addReplyArrayLen(c, steps); + if (rev) { + /* Return in reverse chronological order (newest first) */ + for (uint64_t i = 0; i < steps; i++) + addReplyArrayValue(c, collected[i]); + } else { + /* Return in chronological order (oldest first) */ + for (int64_t i = steps - 1; i >= 0; i--) + addReplyArrayValue(c, collected[i]); + } + zfree(collected); +} + +/* ---------------------------------------------------------------------------- + * ARINFO + * -------------------------------------------------------------------------- */ + +/* ARINFO key [FULL] + * + * Returns metadata about the array in O(1), or O(N) with FULL where N is the + * number of slices. Unlike ARLEN and ARCOUNT, a missing key is an error. + * FULL adds per-encoding slice statistics by scanning the directory. */ +void arinfoCommand(client *c) { + int full = 0; + + if (c->argc > 2) { + if (c->argc == 3 && !strcasecmp(c->argv[2]->ptr, "full")) { + full = 1; + } else { + addReplyErrorObject(c, shared.syntaxerr); + return; + } + } + + robj *o = lookupKeyRead(c->db, c->argv[1]); + if (o == NULL) { + addReplyError(c, "no such key"); + return; + } + if (checkType(c, o, OBJ_ARRAY)) return; + + redisArray *ar = o->ptr; + + /* Per-encoding stats (only computed for FULL) */ + uint64_t num_dense = 0; + uint64_t num_sparse = 0; + uint64_t dense_total_winsize = 0; + uint64_t dense_total_count = 0; + uint64_t sparse_total_cap = 0; + + if (full) { + if (ar->superdir) { + for (uint32_t bi = 0; bi < ar->sdir_len; bi++) { + arSDirEntry *e = ar->superdir + bi; + for (uint32_t si = 0; si < AR_SUPER_BLOCK_SLOTS; si++) { + arSlice *s = e->slots[si]; + if (!s) continue; + if (s->encoding == AR_SLICE_DENSE) { + num_dense++; + dense_total_winsize += s->layout.dense.winsize; + dense_total_count += s->count; + } else { + num_sparse++; + sparse_total_cap += s->layout.sparse.cap; + } + } + } + } else { + for (uint64_t i = 0; i < ar->dir_alloc; i++) { + arSlice *s = ar->dir[i]; + if (!s) continue; + if (s->encoding == 
AR_SLICE_DENSE) { + num_dense++; + dense_total_winsize += s->layout.dense.winsize; + dense_total_count += s->count; + } else { + num_sparse++; + sparse_total_cap += s->layout.sparse.cap; + } + } + } + } + + if (full) { + addReplyMapLen(c, 12); + } else { + addReplyMapLen(c, 7); + } + + addReplyBulkCString(c, "count"); + addReplyUnsignedLongLong(c, ar->count); + + addReplyBulkCString(c, "len"); + addReplyUnsignedLongLong(c, arLen(ar)); + + addReplyBulkCString(c, "next-insert-index"); + if (ar->insert_idx == AR_INSERT_IDX_NONE || + ar->insert_idx == UINT64_MAX - 1) { + addReplyLongLong(c, 0); + } else { + addReplyUnsignedLongLong(c, ar->insert_idx + 1); + } + + addReplyBulkCString(c, "slices"); + addReplyLongLong(c, ar->num_slices); + + addReplyBulkCString(c, "directory-size"); + if (ar->superdir) { + /* Superdir mode: report allocated capacity */ + addReplyLongLong(c, ar->sdir_cap); + } else { + addReplyLongLong(c, ar->dir_alloc); + } + + addReplyBulkCString(c, "super-dir-entries"); + addReplyLongLong(c, ar->superdir ? 
/* Return the smallest power of 2 >= count (e.g. 5 -> 8, 8 -> 8).
 *
 * count <= 1 yields 1.
 *
 * The result is an int, so the largest power of two that can be returned is
 * 2^30. For count > 2^30 the function saturates and returns 2^30 (which is
 * then SMALLER than count); callers that may see such inputs must check for
 * it. The previous implementation evaluated `1 << 31` or `1 << 32` for those
 * inputs, which is undefined behavior (signed overflow / shift by the type
 * width). */
static inline int nearestNextPowerOf2(unsigned int count) {
    const unsigned int max_pow2 = 1U << 30; /* largest power of 2 in an int */
    const int width = (int)(sizeof(unsigned int) * 8);

    if (count <= 1) return 1;
    if (count > max_pow2) return (int)max_pow2;

    /* count-1 is in [1, 2^30 - 1] here, so __builtin_clz() is never given 0
     * and the shift amount is at most 30. Shifting an unsigned 1 keeps the
     * arithmetic well defined. */
    return (int)(1U << (width - __builtin_clz(count - 1)));
}
+ } + + r config set activedefrag no + wait_for_defrag_stop 500 100 + } + + # Verify the array stayed intact after active defrag touched it. + assert_equal $elements [r arcount bigarray1] + assert_equal "a1:0:$payload" [r arget bigarray1 $base] + assert_equal "a1:1234:$payload" [r arget bigarray1 [expr {$base + 1234 * 4096}]] + assert_equal "a1:2999:$payload" [r arget bigarray1 [expr {$base + 2999 * 4096}]] + assert_equal $digest [debug_digest] + assert_equal OK [r save] ;# Iterates all pointers again after defrag. + expr 1 + } {1} + } } test "Active defrag can't be triggered during replicaof database flush. See issue #14267" { diff --git a/tests/unit/type/array.tcl b/tests/unit/type/array.tcl new file mode 100644 index 000000000..d0f62fe3e --- /dev/null +++ b/tests/unit/type/array.tcl @@ -0,0 +1,3114 @@ +start_server { + tags {"array"} +} { + # Basic ARSET/ARGET tests + test {ARSET and ARGET basics} { + r del myarray + assert_equal 1 [r arset myarray 0 hello] + assert_equal hello [r arget myarray 0] + assert_equal {} [r arget myarray 1] + } + + test {ARSET overwrites existing value} { + r del myarray + assert_equal 1 [r arset myarray 0 hello] + assert_equal 0 [r arset myarray 0 world] + assert_equal world [r arget myarray 0] + } + + test {ARGET non-existing key} { + r del myarray + assert_equal {} [r arget myarray 0] + } + + test {ARGET validates index even on non-existing key} { + r del myarray + assert_error {*invalid array index*} {r arget myarray not-an-index} + } + + test {ARSET/ARGET with integer values} { + r del myarray + r arset myarray 0 12345 + assert_equal 12345 [r arget myarray 0] + } + + test {ARSET/ARGET with float values} { + r del myarray + r arset myarray 0 3.14159 + assert_equal 3.14159 [r arget myarray 0] + } + + test {ARSET/ARGET with small strings} { + r del myarray + r arset myarray 0 abc + assert_equal abc [r arget myarray 0] + } + + test {ARSET/ARGET with large string} { + r del myarray + set longstr [string repeat x 100] + r arset myarray 
0 $longstr + assert_equal $longstr [r arget myarray 0] + } + + test {ARSET/ARGET with empty string} { + r del myarray + r arset myarray 0 "" + assert_equal "" [r arget myarray 0] + } + + # ARLEN and ARCOUNT tests + test {ARLEN and ARCOUNT basics} { + r del myarray + assert_equal 0 [r arlen myarray] + assert_equal 0 [r arcount myarray] + + r arset myarray 0 a + assert_equal 1 [r arlen myarray] + assert_equal 1 [r arcount myarray] + + r arset myarray 5 b + assert_equal 6 [r arlen myarray] + assert_equal 2 [r arcount myarray] + + r arset myarray 100 c + assert_equal 101 [r arlen myarray] + assert_equal 3 [r arcount myarray] + } + + # ARDEL tests + test {ARDEL basics} { + r del myarray + r arset myarray 0 a + r arset myarray 1 b + r arset myarray 2 c + + assert_equal 1 [r ardel myarray 1] + assert_equal {} [r arget myarray 1] + assert_equal 2 [r arcount myarray] + + # Delete non-existing index returns 0 + assert_equal 0 [r ardel myarray 1] + } + + test {ARDEL multiple indices} { + r del myarray + r arset myarray 0 a + r arset myarray 1 b + r arset myarray 2 c + r arset myarray 3 d + + assert_equal 3 [r ardel myarray 0 1 2] + assert_equal 1 [r arcount myarray] + } + + test {ARDEL last element deletes key} { + r del myarray + r arset myarray 0 a + r ardel myarray 0 + assert_equal 0 [r exists myarray] + } + + test {ARDEL notifies array event before del when key is removed} { + set orig_notify [lindex [r config get notify-keyspace-events] 1] + r config set notify-keyspace-events KEA + r del myarray + r arset myarray 0 a + + set rd1 [redis_deferring_client] + assert_equal {1} [psubscribe $rd1 *] + assert_equal 1 [r ardel myarray 0] + + assert_match "pmessage * __keyspace@*__:myarray ardel" [$rd1 read] + assert_match "pmessage * __keyevent@*__:ardel myarray" [$rd1 read] + assert_match "pmessage * __keyspace@*__:myarray del" [$rd1 read] + assert_match "pmessage * __keyevent@*__:del myarray" [$rd1 read] + + $rd1 close + r config set notify-keyspace-events $orig_notify + } + + 
# ARDELRANGE tests + test {ARDELRANGE basics} { + r del myarray + for {set i 0} {$i < 10} {incr i} { + r arset myarray $i [expr $i * 10] + } + assert_equal 10 [r arcount myarray] + + assert_equal 5 [r ardelrange myarray 2 6] + assert_equal 5 [r arcount myarray] + } + + test {ARDELRANGE reverse order} { + r del myarray + for {set i 0} {$i < 10} {incr i} { + r arset myarray $i [expr $i * 10] + } + + assert_equal 5 [r ardelrange myarray 6 2] + assert_equal 5 [r arcount myarray] + } + + test {ARDELRANGE notifies array event before del when key is removed} { + set orig_notify [lindex [r config get notify-keyspace-events] 1] + r config set notify-keyspace-events KEA + r del myarray + assert_equal 3 [r arset myarray 0 a b c] + + set rd1 [redis_deferring_client] + assert_equal {1} [psubscribe $rd1 *] + assert_equal 3 [r ardelrange myarray 0 2] + + assert_match "pmessage * __keyspace@*__:myarray ardelrange" [$rd1 read] + assert_match "pmessage * __keyevent@*__:ardelrange myarray" [$rd1 read] + assert_match "pmessage * __keyspace@*__:myarray del" [$rd1 read] + assert_match "pmessage * __keyevent@*__:del myarray" [$rd1 read] + + $rd1 close + r config set notify-keyspace-events $orig_notify + } + + # ARMSET and ARMGET tests + test {ARMSET basics} { + r del myarray + assert_equal 3 [r armset myarray 0 a 1 b 2 c] + assert_equal a [r arget myarray 0] + assert_equal b [r arget myarray 1] + assert_equal c [r arget myarray 2] + } + + test {ARMSET returns only newly filled slots} { + r del myarray + r arset myarray 0 a + assert_equal 1 [r armset myarray 0 aa 1 b] + assert_equal aa [r arget myarray 0] + assert_equal b [r arget myarray 1] + } + + test {ARMGET basics} { + r del myarray + r arset myarray 0 a + r arset myarray 1 b + r arset myarray 5 c + + set result [r armget myarray 0 1 5 3] + assert_equal a [lindex $result 0] + assert_equal b [lindex $result 1] + assert_equal c [lindex $result 2] + assert_equal {} [lindex $result 3] + } + + # ARGETRANGE and contiguous ARSET tests + 
test {ARGETRANGE basics} { + r del myarray + r armset myarray 0 a 1 b 2 c 3 d 4 e + + set result [r argetrange myarray 1 3] + assert_equal {b c d} $result + } + + test {ARGETRANGE reverse} { + r del myarray + r armset myarray 0 a 1 b 2 c 3 d 4 e + + set result [r argetrange myarray 3 1] + assert_equal {d c b} $result + } + + test {ARGETRANGE errors when requested range exceeds the hard limit} { + assert_error {*range exceeds maximum of 1000000 items*} {r argetrange myarray 0 1000000} + } + + test {ARGETRANGE reverse errors when requested range exceeds the hard limit} { + assert_error {*range exceeds maximum of 1000000 items*} {r argetrange myarray 1000000 0} + } + + # ARSCAN tests + test {ARSCAN returns only existing elements with indices} { + r del myarray + r arset myarray 0 a + r arset myarray 5 b + r arset myarray 9 c + + set result [r arscan myarray 0 10] + assert_equal {{0 a} {5 b} {9 c}} $result + } + + test {ARSCAN on empty range returns empty array} { + r del myarray + r arset myarray 500 x + + set result [r arscan myarray 0 100] + assert_equal {} $result + } + + test {ARSCAN reversed range} { + r del myarray + r arset myarray 0 a + r arset myarray 5 b + + set result [r arscan myarray 5 0] + assert_equal {{5 b} {0 a}} $result + } + + test {ARSCAN on non-existent key returns empty array} { + r del nokey + set result [r arscan nokey 0 100] + assert_equal {} $result + } + + test {ARSCAN with mixed value types} { + r del myarray + r arset myarray 0 string + r arset myarray 1 12345 + r arset myarray 2 3.14 + + set result [r arscan myarray 0 10] + assert_equal 3 [llength $result] + assert_equal {0 string} [lindex $result 0] + assert_equal {1 12345} [lindex $result 1] + assert_equal {2 3.14} [lindex $result 2] + } + + # ARGREP tests + test {ARGREP MATCH returns matching indexes} { + r del myarray + r armset myarray 0 alpha 1 beta 2 alphabet 5 gamma + + assert_equal {0 2} [r argrep myarray - + MATCH alpha] + } + + test {ARGREP supports WITHVALUES and reverse 
ranges} { + r del myarray + r armset myarray 0 alpha 1 beta 2 alphabet 3 delta + + assert_equal {{2 alphabet} {0 alpha}} \ + [r argrep myarray 3 0 MATCH alpha WITHVALUES] + } + + test {ARGREP supports AND, GLOB, and NOCASE} { + r del myarray + r armset myarray 0 RedisArray 1 redis-match 2 array-only 3 plain + + assert_equal {0} [r argrep myarray - + MATCH redis GLOB *array* AND NOCASE] + } + + test {ARGREP supports RE predicates} { + r del myarray + r armset myarray 0 foo123 1 bar 2 zoo999 3 Foo777 + + assert_equal {0 2 3} [r argrep myarray - + RE {^.*[0-9]{3}$}] + assert_equal {0 3} [r argrep myarray - + RE {^foo[0-9]+$} NOCASE] + } + + test {ARGREP RE literal alternation forms still match correctly} { + r del myarray + r armset myarray 0 foo 1 bar 2 baz 3 foobar 4 BAR 5 quxfoo 6 zedbar \ + 7 plain 8 ALPS 9 alphabet + + assert_equal {0 1 3 5 6} [r argrep myarray - + RE {foo|bar}] + assert_equal {0 1 3 4 5 6} [r argrep myarray - + RE {foo|bar} NOCASE] + assert_equal {0 1 4} [r argrep myarray - + RE {^(foo|bar)$} NOCASE] + assert_equal {0 1 3 4} [r argrep myarray - + RE {^(foo|bar)} NOCASE] + assert_equal {0 1 3 4 5 6} [r argrep myarray - + RE {(foo|bar)$} NOCASE] + assert_equal {8 9} [r argrep myarray - + RE {alpha|alps} NOCASE] + } + + test {ARGREP RE grouped alternation smoke test} { + r del myarray + r armset myarray 0 item-foo-123 1 ITEM-BAR-456 2 item-baz 3 plain + + assert_equal {0 1} \ + [r argrep myarray - + RE {^item-(foo|bar)-[0-9]{3}$} NOCASE] + } + + test {ARGREP enforces RE length and rejects backreferences} { + r del myarray + set re2048 [string repeat a 2048] + set re2049 [string repeat a 2049] + r arset myarray 0 $re2048 + + assert_equal {0} [r argrep myarray - + RE $re2048] + assert_error {*maximum is 2048 bytes*} {r argrep myarray - + RE $re2049} + assert_error {*backreferences are not supported*} {r argrep myarray - + RE {(a)\1}} + assert_error {*regular expression is empty*} {r argrep myarray - + RE {}} + } + + test {ARGREP LIMIT stops after 
enough matches} { + r del myarray + r armset myarray 0 hit-1 1 hit-2 2 miss 3 hit-3 + + assert_equal {0 1} [r argrep myarray - + MATCH hit LIMIT 2] + } + + test {ARGREP allows mixed predicate and option order, last wins} { + r del myarray + r armset myarray 0 RedisArray 1 redis-match 2 array-only 3 plain + + assert_equal {0} \ + [r argrep myarray - + OR MATCH redis LIMIT 3 GLOB *array* AND LIMIT 1 NOCASE] + } + + test {ARGREP enforces the predicate limit} { + r del myarray + r arset myarray 0 foo + + set cmd [list r argrep myarray - +] + for {set i 0} {$i < 250} {incr i} { + lappend cmd MATCH foo + } + assert_equal {0} [uplevel 1 $cmd] + + lappend cmd MATCH foo + assert_error {*maximum is 250*} [list uplevel 1 $cmd] + } + + test {ARGREP handles missing keys and syntax errors} { + r del nokey + assert_equal {} [r argrep nokey - + MATCH foo] + assert_error {*syntax error*} {r argrep myarray - + LIMIT 1} + assert_error {*invalid regular expression*} {r argrep myarray - + RE {(}} + } + + test {ARGREP rejects malformed braced hex regex escapes} { + r del myarray + r arset myarray 0 hello + + set invalid [format "\\%c%c1" 120 123] + assert_error {*invalid regular expression*} [list r argrep myarray - + RE $invalid] + assert_error {*invalid regular expression*} [list r argrep myarray - + RE $invalid NOCASE] + } + + test {ARSET contiguous write basics} { + r del myarray + assert_equal 3 [r arset myarray 0 a b c] + assert_equal a [r arget myarray 0] + assert_equal b [r arget myarray 1] + assert_equal c [r arget myarray 2] + } + + # ARINSERT tests + test {ARINSERT basics} { + r del myarray + assert_equal 0 [r arinsert myarray a] + assert_equal 1 [r arinsert myarray b] + assert_equal 2 [r arinsert myarray c] + + assert_equal a [r arget myarray 0] + assert_equal b [r arget myarray 1] + assert_equal c [r arget myarray 2] + } + + test {ARRING creates ring buffer} { + r del myarray + for {set i 0} {$i < 10} {incr i} { + r arring myarray 5 $i + } + + # After wrap, we should have 
indices 0-4 with values 5-9 + assert_equal 5 [r arget myarray 0] + assert_equal 6 [r arget myarray 1] + assert_equal 7 [r arget myarray 2] + assert_equal 8 [r arget myarray 3] + assert_equal 9 [r arget myarray 4] + assert_equal 5 [r arcount myarray] + } + + # ARNEXT, ARSEEK tests + test {ARNEXT tracks insert position} { + r del myarray + assert_equal 0 [r arnext myarray] + + r arinsert myarray a + assert_equal 1 [r arnext myarray] + + r arinsert myarray b + assert_equal 2 [r arnext myarray] + } + + test {ARSEEK} { + r del myarray + r arinsert myarray a + r arinsert myarray b + + assert_equal 1 [r arseek myarray 10] + r arinsert myarray c + assert_equal 11 [r arnext myarray] + assert_equal c [r arget myarray 10] + } + + test {ARNEXT returns null when insert cursor is exhausted} { + r del myarray + r arinsert myarray a + + # Move to terminal cursor state: insert_idx = UINT64_MAX-1 + r arseek myarray 18446744073709551615 + assert_equal {} [r arnext myarray] + assert_error {*insert index overflow*} {r arinsert myarray b} + } + + # ARLASTITEMS tests + test {ARLASTITEMS basics} { + r del myarray + for {set i 0} {$i < 5} {incr i} { + r arinsert myarray [expr $i * 10] + } + + set result [r arlastitems myarray 3] + assert_equal {20 30 40} $result + + set result [r arlastitems myarray 3 REV] + assert_equal {40 30 20} $result + } + + test {ARLASTITEMS after ARSEEK 0 uses array tail} { + r del myarray + for {set i 0} {$i < 5} {incr i} { + r arinsert myarray [expr $i * 10] + } + + assert_equal 1 [r arseek myarray 0] + assert_equal {20 30 40} [r arlastitems myarray 3] + assert_equal {40 30 20} [r arlastitems myarray 3 REV] + } + + # AROP tests + test {AROP SUM} { + r del myarray + r armset myarray 0 10 1 20 2 30 + + set result [r arop myarray 0 2 SUM] + assert_equal 60 $result + } + + test {AROP MIN} { + r del myarray + r armset myarray 0 30 1 10 2 20 + + set result [r arop myarray 0 2 MIN] + assert_equal 10 $result + } + + test {AROP MAX} { + r del myarray + r armset myarray 0 
30 1 10 2 20 + + set result [r arop myarray 0 2 MAX] + assert_equal 30 $result + } + + test {AROP MATCH} { + r del myarray + r armset myarray 0 hello 1 world 2 hello 3 foo + + assert_equal 2 [r arop myarray 0 3 MATCH hello] + assert_equal 1 [r arop myarray 0 3 MATCH world] + assert_equal 0 [r arop myarray 0 3 MATCH bar] + } + + test {AROP USED} { + r del myarray + r armset myarray 0 a 2 b 5 c + + assert_equal 3 [r arop myarray 0 10 USED] + } + + test {AROP AND/OR/XOR} { + r del myarray + # Use decimal values: 255, 15, 240 + r armset myarray 0 255 1 15 2 240 + + assert_equal 0 [r arop myarray 0 2 AND] + assert_equal 255 [r arop myarray 0 2 OR] + assert_equal 0 [r arop myarray 0 2 XOR] + } + + test {AROP AND/OR/XOR truncates floats toward zero} { + r del myarray + # Truncated values: 7, 3, 1 + r armset myarray 0 7.9 1 3.2 2 1.8 + + assert_equal 1 [r arop myarray 0 2 AND] + assert_equal 7 [r arop myarray 0 2 OR] + assert_equal 5 [r arop myarray 0 2 XOR] + } + + # ARINFO tests + test {ARINFO basics} { + r del myarray + r armset myarray 0 a 1 b 100 c + + set info [r arinfo myarray] + assert_equal 3 [dict get $info count] + assert_equal 101 [dict get $info len] + } + + # Type check tests + test {Array commands on wrong type} { + r del mykey + r set mykey value + assert_error {WRONGTYPE*} {r arget mykey 0} + assert_error {WRONGTYPE*} {r arset mykey 0 foo} + assert_error {WRONGTYPE*} {r arlen mykey} + assert_error {WRONGTYPE*} {r arcount mykey} + } + + # TYPE command + test {TYPE returns array} { + r del myarray + r arset myarray 0 hello + assert_equal array [r type myarray] + } + + # OBJECT ENCODING command + test {OBJECT ENCODING returns sliced-array} { + r del myarray + r arset myarray 0 hello + assert_equal sliced-array [r object encoding myarray] + } + + # Sparse indices test + test {Sparse array with large gaps} { + r del myarray + r arset myarray 0 a + r arset myarray 10000 b + r arset myarray 1000000 c + + assert_equal a [r arget myarray 0] + assert_equal b [r 
arget myarray 10000] + assert_equal c [r arget myarray 1000000] + assert_equal 3 [r arcount myarray] + assert_equal 1000001 [r arlen myarray] + } + + # RDB persistence test + test {Array survives RDB save and load} { + r del myarray + r armset myarray 0 hello 1 world 100 test + r arseek myarray 101 + r arinsert myarray value + + r bgsave + waitForBgsave r + + r debug reload + assert_equal hello [r arget myarray 0] + assert_equal world [r arget myarray 1] + assert_equal test [r arget myarray 100] + assert_equal value [r arget myarray 101] + assert_equal 102 [r arnext myarray] + } {} {needs:debug} + + # ========================================================================= + # Edge case tests: directory resizing, slice transitions, window growth + # ========================================================================= + + # Directory resizing tests + test {Directory resize - many slices} { + r del myarray + # Default slice size is 4096, so indices 0, 4096, 8192, 12288, etc. + # create new slices requiring directory growth + set slice_size 4096 + for {set i 0} {$i < 20} {incr i} { + set idx [expr {$i * $slice_size}] + r arset myarray $idx "slice$i" + } + + # Verify all values + for {set i 0} {$i < 20} {incr i} { + set idx [expr {$i * $slice_size}] + assert_equal "slice$i" [r arget myarray $idx] + } + assert_equal 20 [r arcount myarray] + } + + test {Directory resize - very large index jump} { + r del myarray + r arset myarray 0 "start" + # Jump to a very high slice index, forcing directory allocation + r arset myarray 1000000 "middle" + r arset myarray 10000000 "end" + + assert_equal "start" [r arget myarray 0] + assert_equal "middle" [r arget myarray 1000000] + assert_equal "end" [r arget myarray 10000000] + assert_equal 3 [r arcount myarray] + } + + # Dense slice window growth tests + test {Dense window growth - right expansion} { + r del myarray + # Start with element at offset 0, then add elements going right + # Initial window is small (8 elements), this 
forces growth + for {set i 0} {$i < 100} {incr i} { + r arset myarray $i "val$i" + } + + # Verify all values stored correctly + for {set i 0} {$i < 100} {incr i} { + assert_equal "val$i" [r arget myarray $i] + } + assert_equal 100 [r arcount myarray] + + # Verify window grew (avg-dense-size should be >= 128 to fit 100 elements) + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert {[dict get $info avg-dense-size] >= 128} + } + + test {Dense window growth - left expansion} { + r del myarray + # Start with element at high offset, then add elements going left + # This forces window to expand leftward + r arset myarray 500 "anchor" + for {set i 499} {$i >= 400} {incr i -1} { + r arset myarray $i "val$i" + } + + assert_equal "anchor" [r arget myarray 500] + for {set i 400} {$i < 500} {incr i} { + assert_equal "val$i" [r arget myarray $i] + } + assert_equal 101 [r arcount myarray] + + # Verify window grew (avg-dense-size should be >= 128 to fit 101 elements) + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert {[dict get $info avg-dense-size] >= 128} + } + + test {Dense window growth - bidirectional expansion} { + r del myarray + # Start in middle, expand both directions + r arset myarray 500 "center" + for {set i 1} {$i <= 50} {incr i} { + r arset myarray [expr {500 - $i}] "left$i" + r arset myarray [expr {500 + $i}] "right$i" + } + + assert_equal "center" [r arget myarray 500] + for {set i 1} {$i <= 50} {incr i} { + assert_equal "left$i" [r arget myarray [expr {500 - $i}]] + assert_equal "right$i" [r arget myarray [expr {500 + $i}]] + } + assert_equal 101 [r arcount myarray] + + # Verify window grew (avg-dense-size should be >= 128 to fit 101 elements) + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert {[dict get $info avg-dense-size] >= 128} + } + + # Sparse to dense promotion tests + test {Sparse to dense promotion - exceed kmax threshold} { + r del 
myarray + # kmax default is 10, add 11+ elements to force promotion + # Use sparse pattern (scattered offsets within one slice) + for {set i 0} {$i < 15} {incr i} { + # Scattered within first slice (0-4095) + set idx [expr {$i * 100}] + r arset myarray $idx "sparse$i" + } + + # Verify all values after promotion + for {set i 0} {$i < 15} {incr i} { + set idx [expr {$i * 100}] + assert_equal "sparse$i" [r arget myarray $idx] + } + assert_equal 15 [r arcount myarray] + + # Verify promotion actually happened using ARINFO FULL + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert_equal 0 [dict get $info sparse-slices] + } + + test {Sparse to dense promotion - then continue adding} { + r del myarray + # First create sparse slice, then promote, then add more + for {set i 0} {$i < 5} {incr i} { + r arset myarray [expr {$i * 200}] "phase1_$i" + } + + # Verify starts as sparse + set info [r arinfo myarray FULL] + assert_equal 0 [dict get $info dense-slices] + assert_equal 1 [dict get $info sparse-slices] + + # Add more to trigger promotion + for {set i 5} {$i < 20} {incr i} { + r arset myarray [expr {$i * 200}] "phase2_$i" + } + + # Verify all + for {set i 0} {$i < 20} {incr i} { + assert_equal "phase[expr {$i < 5 ? 
1 : 2}]_$i" [r arget myarray [expr {$i * 200}]] + } + + # Verify promotion happened + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert_equal 0 [dict get $info sparse-slices] + } + + # Dense to sparse demotion tests + test {Dense to sparse demotion - delete below kmin threshold} { + r del myarray + # Create dense slice with many elements + for {set i 0} {$i < 50} {incr i} { + r arset myarray $i "val$i" + } + assert_equal 50 [r arcount myarray] + + # Verify starts as dense + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert_equal 0 [dict get $info sparse-slices] + + # Delete most elements, leaving only 3 (below kmin=5) + for {set i 3} {$i < 50} {incr i} { + r ardel myarray $i + } + + # Verify remaining elements + assert_equal "val0" [r arget myarray 0] + assert_equal "val1" [r arget myarray 1] + assert_equal "val2" [r arget myarray 2] + assert_equal 3 [r arcount myarray] + + # Verify demotion happened + set info [r arinfo myarray FULL] + assert_equal 0 [dict get $info dense-slices] + assert_equal 1 [dict get $info sparse-slices] + } + + test {Dense to sparse demotion - then add again} { + r del myarray + # Create dense, demote to sparse, then add more + for {set i 0} {$i < 30} {incr i} { + r arset myarray $i "initial$i" + } + + # Delete to demote + for {set i 4} {$i < 30} {incr i} { + r ardel myarray $i + } + assert_equal 4 [r arcount myarray] + + # Verify demotion happened + set info [r arinfo myarray FULL] + assert_equal 0 [dict get $info dense-slices] + assert_equal 1 [dict get $info sparse-slices] + + # Add new elements (should work in sparse mode) + for {set i 100} {$i < 105} {incr i} { + r arset myarray $i "new$i" + } + + # Verify old and new + for {set i 0} {$i < 4} {incr i} { + assert_equal "initial$i" [r arget myarray $i] + } + for {set i 100} {$i < 105} {incr i} { + assert_equal "new$i" [r arget myarray $i] + } + } + + # Combined stress test + test {Stress test - mixed operations 
across multiple slices} { + r del myarray + set slice_size 4096 + + # Create elements across 5 slices + for {set slice 0} {$slice < 5} {incr slice} { + set base [expr {$slice * $slice_size}] + # Add 20 elements per slice + for {set i 0} {$i < 20} {incr i} { + r arset myarray [expr {$base + $i * 50}] "s${slice}_e$i" + } + } + assert_equal 100 [r arcount myarray] + + # Delete half from each slice (should cause some demotions) + for {set slice 0} {$slice < 5} {incr slice} { + set base [expr {$slice * $slice_size}] + for {set i 10} {$i < 20} {incr i} { + r ardel myarray [expr {$base + $i * 50}] + } + } + assert_equal 50 [r arcount myarray] + + # Verify remaining elements + for {set slice 0} {$slice < 5} {incr slice} { + set base [expr {$slice * $slice_size}] + for {set i 0} {$i < 10} {incr i} { + assert_equal "s${slice}_e$i" [r arget myarray [expr {$base + $i * 50}]] + } + } + } + + test {Stress test - rapid insert/delete cycles} { + r del myarray + + # Multiple cycles of growth and shrinkage + for {set cycle 0} {$cycle < 3} {incr cycle} { + # Grow + for {set i 0} {$i < 100} {incr i} { + r arset myarray $i "cycle${cycle}_$i" + } + assert_equal 100 [r arcount myarray] + + # Shrink (but leave some) + for {set i 10} {$i < 100} {incr i} { + r ardel myarray $i + } + assert_equal 10 [r arcount myarray] + } + + # Verify final state + for {set i 0} {$i < 10} {incr i} { + assert_equal "cycle2_$i" [r arget myarray $i] + } + } + + # RDB with complex state + test {RDB persistence with sparse and dense slices} { + r del myarray + + # Create mix of sparse and dense slices + # Slice 0: dense (many elements) + for {set i 0} {$i < 50} {incr i} { + r arset myarray $i "dense$i" + } + + # Slice 1 (offset 4096): sparse (few elements) + r arset myarray 4096 "sparse0" + r arset myarray 4200 "sparse1" + r arset myarray 4500 "sparse2" + + # Slice 10 (offset 40960): single element + r arset myarray 40960 "lonely" + + r bgsave + waitForBgsave r + r debug reload + + # Verify all types survived + 
for {set i 0} {$i < 50} {incr i} { + assert_equal "dense$i" [r arget myarray $i] + } + assert_equal "sparse0" [r arget myarray 4096] + assert_equal "sparse1" [r arget myarray 4200] + assert_equal "sparse2" [r arget myarray 4500] + assert_equal "lonely" [r arget myarray 40960] + } {} {needs:debug} + + # Regression test for dense window boundary bug (GitHub issue) + # When a dense slice window doubles but doesn't reach ar_slice_size, + # offset + winsize could exceed the slice boundary (4096), causing crashes. + test {Regression - dense window growth must not exceed slice boundary} { + r del myarray + set slice_size 4096 + + # Create a dense slice with elements at high offsets within the slice. + # Start at offset 2100 with a small window, then force growth. + # Initial window: offset=2100, winsize=64 (or similar small power of 2) + r arset myarray 2100 "start" + + # Add elements to grow the window to the right. + # After several doublings, winsize might become 2048. + # With offset=2100 and winsize=2048, end would be 4148 > 4096 (BUG!) + # The fix adjusts offset so the window stays within bounds. 
+ for {set i 2101} {$i < 2200} {incr i} { + r arset myarray $i "val$i" + } + + # Now force further right growth that would exceed boundary without fix + for {set i 2200} {$i < 3500} {incr i 10} { + r arset myarray $i "val$i" + } + + # Verify all values are accessible (would crash before the fix) + assert_equal "start" [r arget myarray 2100] + assert_equal "val2150" [r arget myarray 2150] + assert_equal "val3000" [r arget myarray 3000] + + # Verify window respects slice boundary via ARINFO FULL + set info [r arinfo myarray FULL] + set avg_size [dict get $info avg-dense-size] + # With the fix, window should be properly sized (at most slice_size) + assert {$avg_size <= $slice_size} + } + + test {Regression - sparse to dense promotion with high offset boundary} { + r del myarray + set slice_size 4096 + + # Create sparse slice with elements near upper boundary of slice + # This tests arSparsePromote boundary handling + for {set i 0} {$i < 8} {incr i} { + set idx [expr {2200 + $i * 100}] ;# 2200, 2300, ..., 2900 + r arset myarray $idx "sparse$i" + } + + # Verify starts as sparse + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info sparse-slices] + + # Add more to trigger promotion - indices continue from 3000 up to 4100 + # This slice ends up holding 2200-4000 (an 1801-index span); 4100 is past 4096 + # Without boundary fix, offset=2200 + winsize=2048 = 4248 > 4096 (BUG!) 
+ for {set i 8} {$i < 20} {incr i} { + set idx [expr {2200 + $i * 100}] ;# continues: 3000, 3100, ..., 4100 + r arset myarray $idx "promoted$i" + } + + # Verify all values survived promotion (would crash before fix) + for {set i 0} {$i < 8} {incr i} { + set idx [expr {2200 + $i * 100}] + assert_equal "sparse$i" [r arget myarray $idx] + } + for {set i 8} {$i < 20} {incr i} { + set idx [expr {2200 + $i * 100}] + assert_equal "promoted$i" [r arget myarray $idx] + } + } + + # Helper to generate random values of different encoding types + proc random_value {} { + set type [expr {int(rand() * 4)}] + switch $type { + 0 { + # INT encoding: small integers + set val [expr {int(rand() * 200000) - 100000}] + } + 1 { + # FLOAT encoding: synthetic float with random digits + set int_digits [expr {int(rand() * 5) + 1}] ;# 1-5 digits before dot + set frac_digits [expr {int(rand() * 5) + 1}] ;# 1-5 digits after dot + set int_part "" + for {set i 0} {$i < $int_digits} {incr i} { + append int_part [expr {int(rand() * 10)}] + } + set frac_part "" + for {set i 0} {$i < $frac_digits} {incr i} { + append frac_part [expr {int(rand() * 10)}] + } + # Add negative sign randomly + set sign [expr {rand() < 0.5 ? 
"-" : ""}] + set val "${sign}${int_part}.${frac_part}" + } + 2 { + # SMALLSTR encoding: short strings (1-6 bytes) + set len [expr {int(rand() * 6) + 1}] + set val "" + for {set i 0} {$i < $len} {incr i} { + append val [format %c [expr {int(rand() * 26) + 97}]] ;# a-z + } + } + 3 { + # arString pointer: longer strings (10-30 bytes) + set len [expr {int(rand() * 21) + 10}] + set val "" + for {set i 0} {$i < $len} {incr i} { + append val [format %c [expr {int(rand() * 26) + 97}]] ;# a-z + } + } + } + return $val + } + + proc random_array_index {} { + set roll [expr {int(rand() * 100)}] + if {$roll < 35} { + return [expr {int(rand() * 256)}] + } elseif {$roll < 55} { + return [expr {4096 + int(rand() * 512)}] + } elseif {$roll < 75} { + return [expr {8388608 + int(rand() * 8192)}] + } elseif {$roll < 90} { + return [expr {16777216 + int(rand() * 8192)}] + } else { + return [expr {int(rand() * 30000000)}] + } + } + + proc model_array_delrange {arrname lo hi} { + upvar 1 $arrname expected + + if {$lo > $hi} { + set tmp $lo + set lo $hi + set hi $tmp + } + + set deleted 0 + foreach idx [array names expected] { + if {$idx >= $lo && $idx <= $hi} { + unset expected($idx) + incr deleted + } + } + return $deleted + } + + proc model_array_scan {arrname} { + upvar 1 $arrname expected + + set result {} + foreach idx [lsort -integer [array names expected]] { + lappend result [list $idx $expected($idx)] + } + return $result + } + + proc iterator_stress_rand_between {lo hi} { + return [expr {$lo + int(rand() * ($hi - $lo + 1))}] + } + + proc iterator_stress_random_index {slice_size mode} { + set roll [expr {int(rand() * 100)}] + switch -- $mode { + mixed { + if {$roll < 25} { + return [expr {int(rand() * ($slice_size * 2))}] + } elseif {$roll < 45} { + return [expr {$slice_size - 4 + int(rand() * 9)}] + } elseif {$roll < 60} { + return [expr {$slice_size * 2 - 4 + int(rand() * 9)}] + } elseif {$roll < 78} { + return [expr {8388608 + int(rand() * ($slice_size * 2))}] + } elseif 
{$roll < 92} { + return [expr {16777216 + int(rand() * ($slice_size * 2))}] + } else { + return [expr {int(rand() * 30000000)}] + } + } + dense { + if {$roll < 60} { + return [expr {int(rand() * ($slice_size * 2))}] + } elseif {$roll < 80} { + return [expr {$slice_size - 8 + int(rand() * 17)}] + } else { + return [expr {int(rand() * ($slice_size * 8))}] + } + } + superdir { + if {$roll < 20} { + return [expr {int(rand() * 1024)}] + } elseif {$roll < 45} { + return [expr {8388608 + int(rand() * ($slice_size * 4))}] + } elseif {$roll < 70} { + return [expr {16777216 + int(rand() * ($slice_size * 4))}] + } elseif {$roll < 90} { + return [expr {25165824 + int(rand() * ($slice_size * 4))}] + } else { + return [expr {int(rand() * 40000000)}] + } + } + } + return [expr {int(rand() * 30000000)}] + } + + proc iterator_stress_sorted_indices {arrname reverse} { + upvar 1 $arrname model + if {$reverse} { + return [lsort -integer -decreasing [array names model]] + } + return [lsort -integer [array names model]] + } + + proc iterator_stress_scan {arrname start end limit} { + upvar 1 $arrname model + set reverse [expr {$start > $end}] + set lo [expr {$reverse ? $end : $start}] + set hi [expr {$reverse ? $start : $end}] + set result {} + set emitted 0 + + foreach idx [iterator_stress_sorted_indices model $reverse] { + if {$idx < $lo || $idx > $hi} continue + lappend result [list $idx $model($idx)] + incr emitted + if {$limit > 0 && $emitted >= $limit} break + } + return $result + } + + proc iterator_stress_argrep {arrname start end type pattern nocase withvalues limit} { + upvar 1 $arrname model + set reverse [expr {$start > $end}] + set lo [expr {$reverse ? $end : $start}] + set hi [expr {$reverse ? 
$start : $end}] + set pattern_cmp $pattern + if {$nocase} { set pattern_cmp [string tolower $pattern_cmp] } + set result {} + set emitted 0 + + foreach idx [iterator_stress_sorted_indices model $reverse] { + if {$idx < $lo || $idx > $hi} continue + set value $model($idx) + set cmp $value + if {$nocase} { set cmp [string tolower $cmp] } + + if {$type eq "EXACT"} { + set match [expr {$cmp eq $pattern_cmp}] + } else { + set match [expr {[string first $pattern_cmp $cmp] != -1}] + } + + if {$match} { + if {$withvalues} { + lappend result [list $idx $value] + } else { + lappend result $idx + } + incr emitted + if {$emitted >= $limit} break + } + } + return $result + } + + proc iterator_stress_arop_used {arrname start end} { + upvar 1 $arrname model + set lo [expr {$start > $end ? $end : $start}] + set hi [expr {$start > $end ? $start : $end}] + set used 0 + + foreach idx [array names model] { + if {$idx >= $lo && $idx <= $hi} { incr used } + } + return $used + } + + proc iterator_stress_arop_match {arrname start end needle} { + upvar 1 $arrname model + set lo [expr {$start > $end ? $end : $start}] + set hi [expr {$start > $end ? $start : $end}] + set matches 0 + + foreach idx [array names model] { + if {$idx >= $lo && $idx <= $hi && $model($idx) eq $needle} { + incr matches + } + } + return $matches + } + + proc iterator_stress_arop_sum {arrname start end} { + upvar 1 $arrname model + set lo [expr {$start > $end ? $end : $start}] + set hi [expr {$start > $end ? 
$start : $end}] + set sum 0.0 + set has_numeric 0 + + foreach idx [array names model] { + if {$idx < $lo || $idx > $hi} continue + if {[string is double -strict $model($idx)]} { + set sum [expr {$sum + ($model($idx) + 0.0)}] + set has_numeric 1 + } + } + + if {!$has_numeric} { return {} } + return $sum + } + + proc iterator_stress_pick_existing_value {arrname} { + upvar 1 $arrname model + set keys [array names model] + if {[llength $keys] == 0} { return [random_value] } + return $model([lindex $keys [expr {int(rand() * [llength $keys])}]]) + } + + proc iterator_stress_pick_match_pattern {value} { + set len [string length $value] + if {$len <= 1} { return $value } + set start [expr {int(rand() * $len)}] + set width [expr {1 + int(rand() * ($len - $start))}] + return [string range $value $start [expr {$start + $width - 1}]] + } + + proc iterator_stress_flip_case {value} { + set out "" + foreach ch [split $value ""] { + if {![string is alpha -strict $ch] || rand() < 0.5} { + append out $ch + } elseif {$ch eq [string tolower $ch]} { + append out [string toupper $ch] + } else { + append out [string tolower $ch] + } + } + return $out + } + + proc iterator_stress_check_equal {label expected got} { + if {$expected ne $got} { + fail "$label mismatch - expected '$expected', got '$got'" + } + } + + proc iterator_stress_check_sum {label expected got} { + if {$expected eq {} || $got eq {}} { + if {$expected ne $got} { + fail "$label mismatch - expected '$expected', got '$got'" + } + return + } + + if {abs(($expected + 0.0) - ($got + 0.0)) > 1e-9} { + fail "$label mismatch - expected '$expected', got '$got'" + } + } + + proc iterator_stress_validate {r arrname slice_size mode tag step full_scan} { + upvar 1 $arrname model + set count [array size model] + + if {$count == 0} { + iterator_stress_check_equal "$tag/$step exists" 0 [r exists myarray] + if {$full_scan} { + iterator_stress_check_equal "$tag/$step empty-scan" {} \ + [r arscan myarray 0 50000000] + } + return + } + + 
iterator_stress_check_equal "$tag/$step count" $count [r arcount myarray] + if {$full_scan} { + set start [expr {$step % 2 == 0 ? 0 : 50000000}] + set end [expr {$step % 2 == 0 ? 50000000 : 0}] + iterator_stress_check_equal "$tag/$step full-scan" \ + [iterator_stress_scan model $start $end 0] \ + [r arscan myarray $start $end] + } + + for {set probe 0} {$probe < 2} {incr probe} { + set start [iterator_stress_random_index $slice_size $mode] + set end [iterator_stress_random_index $slice_size $mode] + if {rand() < 0.15} { set start 0 } + if {rand() < 0.15} { set end 50000000 } + + set limit [iterator_stress_rand_between 1 10] + iterator_stress_check_equal "$tag/$step scan/$probe" \ + [iterator_stress_scan model $start $end $limit] \ + [r arscan myarray $start $end LIMIT $limit] + + set grep_type [expr {rand() < 0.5 ? "EXACT" : "MATCH"}] + if {rand() < 0.7} { + set pattern [iterator_stress_pick_existing_value model] + if {$grep_type eq "MATCH"} { + set pattern [iterator_stress_pick_match_pattern $pattern] + } + } else { + set pattern [random_value] + } + + set withvalues [expr {rand() < 0.5}] + set nocase [expr {rand() < 0.5}] + if {$nocase} { set pattern [iterator_stress_flip_case $pattern] } + set grep_limit [iterator_stress_rand_between 1 8] + set grep_cmd [list r argrep myarray $start $end $grep_type $pattern LIMIT $grep_limit] + if {$withvalues} { lappend grep_cmd WITHVALUES } + if {$nocase} { lappend grep_cmd NOCASE } + + iterator_stress_check_equal "$tag/$step argrep/$probe" \ + [iterator_stress_argrep model $start $end $grep_type $pattern $nocase $withvalues $grep_limit] \ + [uplevel 1 $grep_cmd] + + iterator_stress_check_equal "$tag/$step used/$probe" \ + [iterator_stress_arop_used model $start $end] \ + [r arop myarray $start $end USED] + + set needle [iterator_stress_pick_existing_value model] + iterator_stress_check_equal "$tag/$step match/$probe" \ + [iterator_stress_arop_match model $start $end $needle] \ + [r arop myarray $start $end MATCH $needle] + + 
iterator_stress_check_sum "$tag/$step sum/$probe" \ + [iterator_stress_arop_sum model $start $end] \ + [r arop myarray $start $end SUM] + } + } + + proc iterator_stress_apply_operation {r arrname slice_size mode} { + upvar 1 $arrname model + set roll [expr {int(rand() * 100)}] + + if {$roll < 30} { + set idx [iterator_stress_random_index $slice_size $mode] + set val [random_value] + r arset myarray $idx $val + set model($idx) $val + } elseif {$roll < 45} { + set start [iterator_stress_random_index $slice_size $mode] + set values {} + set len [iterator_stress_rand_between 2 8] + + for {set i 0} {$i < $len} {incr i} { + set val [random_value] + lappend values $val + set model([expr {$start + $i}]) $val + } + r arset myarray $start {*}$values + } elseif {$roll < 58} { + set idx [iterator_stress_random_index $slice_size $mode] + r ardel myarray $idx + catch {unset model($idx)} + } elseif {$roll < 78} { + set args {} + set nranges [iterator_stress_rand_between 1 3] + + for {set i 0} {$i < $nranges} {incr i} { + set lo [iterator_stress_random_index $slice_size $mode] + set hi [iterator_stress_random_index $slice_size $mode] + lappend args $lo $hi + model_array_delrange model $lo $hi + } + r ardelrange myarray {*}$args + } elseif {$roll < 90} { + set base [expr {[iterator_stress_random_index $slice_size $mode] / $slice_size * $slice_size}] + set start [expr {$base + [iterator_stress_rand_between 0 [expr {$slice_size > 16 ? 16 : $slice_size - 1}]]}] + set values {} + set len [iterator_stress_rand_between 4 10] + + for {set i 0} {$i < $len} {incr i} { + set val [random_value] + lappend values $val + set model([expr {$start + $i}]) $val + } + r arset myarray $start {*}$values + } else { + set base [expr {[iterator_stress_random_index $slice_size $mode] / $slice_size * $slice_size}] + set lo [expr {$base + [iterator_stress_rand_between 0 [expr {$slice_size > 24 ? 24 : $slice_size - 1}]]}] + set hi [expr {$base + [iterator_stress_rand_between 0 [expr {$slice_size > 24 ? 
24 : $slice_size - 1}]]}] + model_array_delrange model $lo $hi + r ardelrange myarray $lo $hi + } + } + + # Random testing - most effective way to find edge case bugs + test {Random testing - staged write/delete workload with verification} { + r flushdb + expr {srand(12345)} ;# Fixed seed for reproducibility + set max_idx 5000 ;# Range of possible indices + set ops_per_stage 200 ;# Operations per stage + + # Tcl-side tracking of expected state + array set expected {} + + # 11 stages with decreasing write ratio + # Stage 0: 100% writes, Stage 10: 0% writes 100% deletes + set stages { + {100 0} + {90 10} + {80 20} + {70 30} + {60 40} + {50 50} + {40 60} + {30 70} + {20 80} + {10 90} + {0 100} + } + + set stage_num 0 + foreach stage $stages { + set write_pct [lindex $stage 0] + + for {set op 0} {$op < $ops_per_stage} {incr op} { + set roll [expr {int(rand() * 100)}] + set idx [expr {int(rand() * $max_idx)}] + + if {$roll < $write_pct} { + # Write operation with random value type + set val [random_value] + r arset myarray $idx $val + set expected($idx) $val + } else { + # Delete operation - always send to Redis, track locally + r ardel myarray $idx + if {[info exists expected($idx)]} { + unset expected($idx) + } + } + } + + # Verify entire array matches expected state + set expected_count [array size expected] + if {[r exists myarray]} { + set actual_count [r arcount myarray] + } else { + set actual_count 0 + } + + if {$expected_count != $actual_count} { + fail "Stage $stage_num: count mismatch - expected $expected_count, got $actual_count" + } + + # Verify all expected values individually + foreach idx [array names expected] { + set got [r arget myarray $idx] + if {$got ne $expected($idx)} { + fail "Stage $stage_num: idx $idx - expected '$expected($idx)', got '$got'" + } + } + + incr stage_num + } + + # Final cleanup: delete all remaining expected entries + foreach idx [array names expected] { + r ardel myarray $idx + unset expected($idx) + } + + # After cleanup, 
array should be empty/deleted + assert_equal 0 [r exists myarray] + } + + test {Random testing - large scale with RDB verification} { + r flushdb + expr {srand(54321)} ;# Fixed seed for reproducibility + set max_idx 100000 ;# Range to test multiple slices + set num_writes 2000 + + # Tcl-side tracking + array set expected {} + + # Phase 1: Random writes with mixed value types + for {set i 0} {$i < $num_writes} {incr i} { + set idx [expr {int(rand() * $max_idx)}] + set val [random_value] + r arset myarray $idx $val + set expected($idx) $val + } + + set expected_count [array size expected] + set count_before [r arcount myarray] + assert_equal $expected_count $count_before + + # Save and reload + r bgsave + waitForBgsave r + r debug reload + + # Verify count preserved + assert_equal $count_before [r arcount myarray] + + # Verify all expected values + foreach idx [array names expected] { + set got [r arget myarray $idx] + if {$got ne $expected($idx)} { + fail "After reload: idx $idx - expected '$expected($idx)', got '$got'" + } + } + + # Phase 2: Random deletes (delete half) + set keys_list [array names expected] + set delete_count [expr {[llength $keys_list] / 2}] + for {set i 0} {$i < $delete_count} {incr i} { + set idx [lindex $keys_list $i] + r ardel myarray $idx + unset expected($idx) + } + + # Verify remaining + set remaining [array size expected] + assert_equal $remaining [r arcount myarray] + + foreach idx [array names expected] { + assert_equal $expected($idx) [r arget myarray $idx] + } + } {} {needs:debug} + + test {Random testing - iterator model stress across dense sparse and superdir} { + set orig_slice_size [lindex [r config get array-slice-size] 1] + set orig_kmax [lindex [r config get array-sparse-kmax] 1] + set orig_kmin [lindex [r config get array-sparse-kmin] 1] + set scenarios { + {mixed-default 4096 10 5 mixed 120 111} + {small-slices 256 6 3 dense 140 333} + {superdir-heavy 1024 8 4 superdir 160 555} + {superdir-heavy 1024 8 4 superdir 160 666} + } 
+ + set err [catch { + foreach scenario $scenarios { + lassign $scenario name slice_size kmax kmin mode steps seed + r flushdb + r config set array-sparse-kmax $kmax + r config set array-sparse-kmin $kmin + r config set array-slice-size $slice_size + expr {srand($seed)} + catch {array unset model} + array set model {} + + # Start each scenario with the exact superdir shape that + # previously exposed iterator progress bugs. + r arset myarray 43 a + set model(43) a + r arset myarray [expr {$slice_size + 490}] b + set model([expr {$slice_size + 490}]) b + r arset myarray 19245258 c + set model(19245258) c + + iterator_stress_validate r model $slice_size $mode "$name/$seed" -1 1 + + for {set step 0} {$step < $steps} {incr step} { + iterator_stress_apply_operation r model $slice_size $mode + iterator_stress_validate r model $slice_size $mode \ + "$name/$seed" $step [expr {$step % 20 == 0}] + } + } + } msg opts] + + r flushdb + r config set array-sparse-kmax $orig_kmax + r config set array-sparse-kmin $orig_kmin + r config set array-slice-size $orig_slice_size + + if {$err} { + return -options $opts $msg + } + } + + # ========================================================================= + # Circular buffer (ring buffer) comprehensive tests + # ========================================================================= + + test {Circular buffer - ARRING basic wraparound} { + r del myarray + # Insert 20 values with MOD 10 - should wrap around twice + for {set i 0} {$i < 20} {incr i} { + set result [r arring myarray 10 "val$i"] + assert_equal [expr {$i % 10}] $result + } + # Should have exactly 10 elements (0-9) + assert_equal 10 [r arcount myarray] + # Values should be the last 10 inserted (val10-val19) + for {set i 0} {$i < 10} {incr i} { + assert_equal "val[expr {$i + 10}]" [r arget myarray $i] + } + } + + test {Circular buffer - ARRING with size 1} { + r del myarray + # MOD 1 means only ever keep one element at index 0 + for {set i 0} {$i < 100} {incr i} { + r arring 
myarray 1 "val$i" + } + assert_equal 1 [r arcount myarray] + assert_equal "val99" [r arget myarray 0] + } + + test {Circular buffer - ARRING preserves insert_idx through RDB} { + r del myarray + # Create a circular buffer, wrap around a few times + for {set i 0} {$i < 15} {incr i} { + r arring myarray 5 "val$i" + } + # The next ring write should land at index 0 (15 % 5 = 0) + set next_before [r arnext myarray] + + # Save and reload + r bgsave + waitForBgsave r + r debug reload + + # Verify insert_idx is preserved + assert_equal $next_before [r arnext myarray] + + # Continue inserting - should continue from where it left off + r arring myarray 5 "after_reload" + # after_reload must sit at the pre-reload write position (next_before % 5) + assert_equal "after_reload" [r arget myarray [expr {$next_before % 5}]] + } {} {needs:debug} + + test {Circular buffer - ARLASTITEMS with wraparound} { + r del myarray + # Create circular buffer with 8 items, MOD 5 + for {set i 0} {$i < 8} {incr i} { + r arring myarray 5 $i + } + # Positions now hold: 0->5, 1->6, 2->7, 3->3, 4->4 + # Next write position = 3 (8 % 5 = 3) + + # ARLASTITEMS should return the N most recently inserted + set result [r arlastitems myarray 3] + # Last 3 inserted: 7, 6, 5 - in chronological order: 5, 6, 7 + assert_equal {5 6 7} $result + + # With REV flag + set result [r arlastitems myarray 3 REV] + assert_equal {7 6 5} $result + + # Request more items than exist + set result [r arlastitems myarray 10] + assert_equal 5 [llength $result] + } + + test {Circular buffer - ARLASTITEMS handles empty and partial cases} { + r del myarray + # Empty array + set result [r arlastitems myarray 5] + assert_equal {} $result + + # Fewer items than requested (no wraparound yet) + r arring myarray 10 a + r arring myarray 10 b + r arring myarray 10 c + + set result [r arlastitems myarray 5] + assert_equal {a b c} $result + } + + test {Circular buffer - ARNEXT tracks correctly with ARRING} { + r del myarray + # Insert with MOD, tracking position + # MOD wraps the insert 
position but ARNEXT continues until next wrap + for {set i 0} {$i < 7} {incr i} { + set expected_idx [expr {$i % 4}] + set result [r arring myarray 4 $i] + assert_equal $expected_idx $result + # ARNEXT: after a wraparound insert, it's expected_idx+1 + # Otherwise it's the running counter+1 until it wraps + if {$i < 4} { + # Before first wrap, ARNEXT is i+1 + assert_equal [expr {$i + 1}] [r arnext myarray] + } else { + # After wrap, ARNEXT is (position+1) + assert_equal [expr {$expected_idx + 1}] [r arnext myarray] + } + } + } + + test {Circular buffer - ARSEEK followed by ARRING} { + r del myarray + # Start inserting + r arinsert myarray a + r arinsert myarray b + r arinsert myarray c + # insert_idx = 2, next = 3 + + # Seek to position 10 + r arseek myarray 10 + assert_equal 10 [r arnext myarray] + + # Now use MOD - should reset behavior + r arring myarray 5 x + # This should insert at index 0 (10 % 5 = 0) + assert_equal x [r arget myarray 0] + } + + test {Circular buffer - ARSEEK 0 is honored on ARRING grow} { + r del myarray + for {set i 0} {$i < 5} {incr i} { + r arring myarray 3 "ring$i" + } + + assert_equal 1 [r arseek myarray 0] + r arring myarray 8 "grown" + + # ARSEEK 0 is an explicit cursor override, so grow should not repack + # first: the next ARRING write still goes to index 0. + assert_equal "grown" [r arget myarray 0] + assert_equal "ring4" [r arget myarray 1] + assert_equal "ring2" [r arget myarray 2] + assert_equal 1 [r arnext myarray] + } + + test {Circular buffer - ARRING growth uses new capacity after wrap} { + r del myarray + for {set i 0} {$i < 8} {incr i} { + r arring myarray 5 "v$i" + } + # Current ring window contains the latest 5 values: + # v3 v4 v5 v6 v7, with insert_idx at position 2. + + r arring myarray 8 "grown" + + # Growing must compact the wrapped ring first, so the new value uses + # the newly added capacity instead of overwriting low indexes again. 
+ assert_equal "v3" [r arget myarray 0] + assert_equal "v4" [r arget myarray 1] + assert_equal "v5" [r arget myarray 2] + assert_equal "v6" [r arget myarray 3] + assert_equal "v7" [r arget myarray 4] + assert_equal "grown" [r arget myarray 5] + assert_equal 6 [r arnext myarray] + } + + test {Circular buffer - Mixed ARSET and ARRING immediately restores ring size} { + r del myarray + # Use MOD to create ring buffer + for {set i 0} {$i < 5} {incr i} { + r arring myarray 3 "ring$i" + } + # After 5 inserts with MOD 3: + # Position 0: ring0 -> ring3 (overwritten) + # Position 1: ring1 -> ring4 (overwritten) + # Position 2: ring2 + # insert_idx=1, next=2 + + # Now manually set a value outside the ring + r arset myarray 100 "outside" + + # Ring buffer values should still be there + assert_equal "ring3" [r arget myarray 0] + assert_equal "ring4" [r arget myarray 1] + assert_equal "ring2" [r arget myarray 2] + assert_equal "outside" [r arget myarray 100] + + # Continue ring buffer. The ring size should be re-established + # immediately, so values outside the 0..2 window disappear at once. 
+ r arring myarray 3 "ring5" + assert_equal 3 [r arcount myarray] + assert_equal {} [r arget myarray 100] + assert_equal "ring5" [r arget myarray 0] + } + + test {Circular buffer - insert_idx survives RDB with complex state} { + r del myarray + # Create a wrapped circular buffer (100 inserts, MOD 50) + for {set i 0} {$i < 100} {incr i} { + # NOTE(review): MOD 50 keeps the ring in indices 0..49; confirm whether spreading across slices was intended + r arring myarray 50 "v$i" + } + + set info_before [r arinfo myarray] + set next_before [r arnext myarray] + set count_before [r arcount myarray] + + # Also set some values outside the ring + r arset myarray 10000 "far_away" + + # Save and reload + r bgsave + waitForBgsave r + r debug reload + + # Verify state preserved + assert_equal $count_before [expr {[r arcount myarray] - 1}] ;# -1 for far_away + assert_equal $next_before [r arnext myarray] + assert_equal "far_away" [r arget myarray 10000] + + # Verify ring buffer content - last 50 values should be v50-v99 + for {set i 0} {$i < 50} {incr i} { + assert_equal "v[expr {$i + 50}]" [r arget myarray $i] + } + } {} {needs:debug} + + test {Circular buffer - ARLASTITEMS reverse order} { + r del myarray + # Create ring with wraparound + for {set i 0} {$i < 12} {incr i} { + r arring myarray 8 "v$i" + } + # After 12 inserts MOD 8: + # next write position = 12 % 8 = 4, so the last insert landed at position 3 + # Values: positions 0-7 contain v4-v11 + + # ARLASTITEMS returns most recent items in chronological order + set result [r arlastitems myarray 4] + # Last 4 inserted were v11, v10, v9, v8 - returned oldest to newest + assert_equal {v8 v9 v10 v11} $result + + # With REV flag - returned newest to oldest + set result [r arlastitems myarray 4 REV] + assert_equal {v11 v10 v9 v8} $result + + # Request all items + set result [r arlastitems myarray 100] + assert_equal 8 [llength $result] + } + + test {Circular buffer - ARRING truncation when size decreases} { + r del myarray + # Create ring buffer with MOD 10 + for {set i 0} {$i < 15} {incr i} { + r arring myarray 10
"v$i" + } + # Now have 10 elements at positions 0-9 + # After 15 inserts: 0->v10, 1->v11, ..., 4->v14, 5->v5, ..., 9->v9 + assert_equal 10 [r arcount myarray] + + # Use smaller MOD - this truncates to positions 0-4 AND inserts new value + # The new insert goes to position (15 % 5) = 0, replacing v10 + r arring myarray 5 "truncated" + # Now have only 5 elements (positions 0-4), with position 0 = "truncated" + assert_equal 5 [r arcount myarray] + + # Verify values + assert_equal "truncated" [r arget myarray 0] ;# new value + assert_equal "v11" [r arget myarray 1] + assert_equal "v12" [r arget myarray 2] + assert_equal "v13" [r arget myarray 3] + assert_equal "v14" [r arget myarray 4] + + # Positions 5-9 should be empty (truncated) + assert_equal {} [r arget myarray 5] + assert_equal {} [r arget myarray 9] + } + + test {Circular buffer - ARRING shrink stops at first hole} { + r del myarray + for {set i 0} {$i < 5} {incr i} { + r arring myarray 5 "v$i" + } + + r ardel myarray 3 + r arring myarray 3 "new" + + assert_equal 2 [r arcount myarray] + assert_equal "v4" [r arget myarray 0] + assert_equal "new" [r arget myarray 1] + assert_equal {} [r arget myarray 2] + } + + test {Circular buffer - ARRING grow stops at first hole} { + r del myarray + for {set i 0} {$i < 8} {incr i} { + r arring myarray 5 "v$i" + } + + r ardel myarray 1 + r arring myarray 8 "grown" + + assert_equal 2 [r arcount myarray] + assert_equal "v7" [r arget myarray 0] + assert_equal "grown" [r arget myarray 1] + assert_equal {} [r arget myarray 2] + } + + test {Circular buffer - ARLASTITEMS with various counts and REV} { + r del myarray + # Create simple ring buffer + for {set i 0} {$i < 20} {incr i} { + r arring myarray 10 "item$i" + } + # Contains item10-item19 at positions 0-9 + + # Get exactly 1 item + assert_equal {item19} [r arlastitems myarray 1] + assert_equal {item19} [r arlastitems myarray 1 REV] + + # Get 3 items + set result [r arlastitems myarray 3] + assert_equal {item17 item18 item19} 
$result + set result [r arlastitems myarray 3 REV] + assert_equal {item19 item18 item17} $result + + # Get all 10 items + set result [r arlastitems myarray 10] + assert_equal 10 [llength $result] + assert_equal "item10" [lindex $result 0] + assert_equal "item19" [lindex $result end] + + # REV order for all items + set result [r arlastitems myarray 10 REV] + assert_equal "item19" [lindex $result 0] + assert_equal "item10" [lindex $result end] + } + + test {Circular buffer - ARLASTITEMS edge cases} { + r del myarray + # Empty array + assert_equal {} [r arlastitems myarray 5] + assert_equal {} [r arlastitems myarray 5 REV] + + # Single element + r arinsert myarray "only" + assert_equal {only} [r arlastitems myarray 1] + assert_equal {only} [r arlastitems myarray 10] + assert_equal {only} [r arlastitems myarray 1 REV] + + # Two elements - no wraparound yet + r arinsert myarray "second" + assert_equal {only second} [r arlastitems myarray 5] + assert_equal {second only} [r arlastitems myarray 5 REV] + } + + # ============================================================ + # Regression tests for bugs found during code review + # ============================================================ + + test {Regression #3 - arTruncate must decrement count correctly} { + r del myarray + # Fill array with 20 elements + for {set i 0} {$i < 20} {incr i} { + r arset myarray $i "val$i" + } + assert_equal 20 [r arcount myarray] + + # Use ARRING to trigger truncation + # First move the cursor with ARSEEK 16 (ARNEXT becomes 16), then insert with MOD 10 + r arseek myarray 16 + r arring myarray 10 "wrap" + + # After MOD 10 truncation, only indices 0-9 should exist + # The count should be <= 10 (some original values + new one) + set count [r arcount myarray] + assert_lessthan $count 11 ;# count <= 10 + + # Verify elements >= 10 are gone + assert_equal {} [r arget myarray 10] + assert_equal {} [r arget myarray 15] + assert_equal {} [r arget myarray 19] + } + + test {Regression #5 - AROP MATCH with large strings (>256 bytes)}
{ + r del myarray + # Create a string larger than 256 bytes + set largestr [string repeat "x" 300] + set largestr2 [string repeat "y" 300] + + r arset myarray 0 $largestr + r arset myarray 1 "small" + r arset myarray 2 $largestr + r arset myarray 3 $largestr2 + + # MATCH should find exactly 2 occurrences of largestr + assert_equal 2 [r arop myarray 0 3 MATCH $largestr] + assert_equal 1 [r arop myarray 0 3 MATCH $largestr2] + assert_equal 1 [r arop myarray 0 3 MATCH "small"] + assert_equal 0 [r arop myarray 0 3 MATCH "notfound"] + } + + test {Regression #6 - DEBUG DIGEST with large strings (>256 bytes)} { + r del myarray + set largestr [string repeat "z" 500] + r arset myarray 0 $largestr + r arset myarray 1 "small" + r arset myarray 100 [string repeat "w" 1000] + + # Get digest - should not crash and should be deterministic + set d1 [r debug digest-value myarray] + set d2 [r debug digest-value myarray] + assert_equal $d1 $d2 "Digest should be deterministic" + + # Modify and verify digest changes + r arset myarray 0 "changed" + set d3 [r debug digest-value myarray] + if {$d1 eq $d3} { + fail "Digest should change after modification" + } + } {} {needs:debug} + + test {Regression #7 - RDB with negative integers including -1} { + r flushdb + # -1 was problematic because it became UINT64_MAX which was RDB_LENERR + r arset myarray 0 -1 + r arset myarray 1 -100 + r arset myarray 2 -9223372036854775808 ;# INT64_MIN as string + r arset myarray 3 0 + r arset myarray 4 1 + r arset myarray 5 9223372036854775807 ;# INT64_MAX as string + + set d1 [r debug digest-value myarray] + + # Save and reload + r bgsave + waitForBgsave r + r debug reload + + # Verify values survived + assert_equal -1 [r arget myarray 0] + assert_equal -100 [r arget myarray 1] + # Note: very large integers may be stored as strings + assert_equal 0 [r arget myarray 3] + assert_equal 1 [r arget myarray 4] + + set d2 [r debug digest-value myarray] + assert_equal $d1 $d2 "Digest should match after RDB reload" + 
} {} {needs:debug} + + test {Regression #10 - ARSEEK on non-existent key should not create it} { + r del myarray + # ARSEEK on non-existent key + assert_equal 0 [r arseek myarray 100] + + # Key should NOT exist + assert_equal 0 [r exists myarray] + + # Now create the array and verify ARSEEK works + r arinsert myarray "first" + assert_equal 1 [r exists myarray] + + # ARSEEK on existing key should work + assert_equal 1 [r arseek myarray 50] + r arinsert myarray "second" + assert_equal 51 [r arnext myarray] + } + + test {Regression #12 - ARMGET/ARGETRANGE return WRONGTYPE on wrong type} { + r del myarray + r set myarray "string_value" + + # ARMGET should return WRONGTYPE error + assert_error {WRONGTYPE*} {r armget myarray 0 1 2} + + # ARGETRANGE should return WRONGTYPE error + assert_error {WRONGTYPE*} {r argetrange myarray 0 10} + + # Cleanup + r del myarray + } + + test {Regression - RDB preserves exact numeric string forms} { + r flushdb + set values [list \ + 0 "3.141592653589793" \ + 1 "-2.718281828459045" \ + 2 "1.0e-10" \ + 3 "1.0e+100"] + + foreach {idx val} $values { + r arset myarray $idx $val + } + + foreach {idx val} $values { + assert_equal $val [r arget myarray $idx] + } + + # Save and reload + r bgsave + waitForBgsave r + r debug reload + + foreach {idx val} $values { + assert_equal $val [r arget myarray $idx] + } + } {} {needs:debug} + + test {Whole-number floats with .0 suffix encode as inline floats} { + # Values like "1.0" should be encoded as inline floats, not heap strings. + # This tests the ".0" suffix optimization in arTryEncodeFloat. 
+ r del myarray + + # Various whole-number floats that should round-trip with ".0" + r arset myarray 0 1.0 + r arset myarray 1 -1.0 + r arset myarray 2 0.0 + r arset myarray 3 42.0 + r arset myarray 4 -42.0 + r arset myarray 5 1000000.0 + r arset myarray 6 -9999999.0 + + # Verify exact round-trip (the ".0" must be preserved) + assert_equal "1.0" [r arget myarray 0] + assert_equal "-1.0" [r arget myarray 1] + assert_equal "0.0" [r arget myarray 2] + assert_equal "42.0" [r arget myarray 3] + assert_equal "-42.0" [r arget myarray 4] + assert_equal "1000000.0" [r arget myarray 5] + assert_equal "-9999999.0" [r arget myarray 6] + + # Verify these survive RDB save/reload (confirms they're properly encoded) + r bgsave + waitForBgsave r + r debug reload + + assert_equal "1.0" [r arget myarray 0] + assert_equal "-1.0" [r arget myarray 1] + assert_equal "0.0" [r arget myarray 2] + assert_equal "42.0" [r arget myarray 3] + assert_equal "-42.0" [r arget myarray 4] + assert_equal "1000000.0" [r arget myarray 5] + assert_equal "-9999999.0" [r arget myarray 6] + } {} {needs:debug} + + test {Integer values without .0 still encode as integers, not floats} { + # Ensure "1" (without decimal) is encoded as integer, not float + r del myarray + + r arset myarray 0 1 + r arset myarray 1 -1 + r arset myarray 2 0 + r arset myarray 3 42 + r arset myarray 4 9999999 + + # Values without ".0" should stay as integers + assert_equal "1" [r arget myarray 0] + assert_equal "-1" [r arget myarray 1] + assert_equal "0" [r arget myarray 2] + assert_equal "42" [r arget myarray 3] + assert_equal "9999999" [r arget myarray 4] + + # Verify RDB round-trip preserves them as integers + r bgsave + waitForBgsave r + r debug reload + + assert_equal "1" [r arget myarray 0] + assert_equal "-1" [r arget myarray 1] + assert_equal "0" [r arget myarray 2] + assert_equal "42" [r arget myarray 3] + assert_equal "9999999" [r arget myarray 4] + } {} {needs:debug} + + test {AROP on whole-number floats works correctly} { + 
# Verify AROP aggregation works on values encoded with the .0 optimization + r del myarray + + r arset myarray 0 10.0 + r arset myarray 1 20.0 + r arset myarray 2 30.0 + + # SUM should work on whole-number floats (AROP returns computed values) + assert_equal 60 [r arop myarray 0 2 SUM] + + # MIN/MAX should work + assert_equal 10 [r arop myarray 0 2 MIN] + assert_equal 30 [r arop myarray 0 2 MAX] + + # MATCH should find the encoded values + assert_equal 1 [r arop myarray 0 2 MATCH 10.0] + assert_equal 1 [r arop myarray 0 2 MATCH 20.0] + } + + test {Exact string recovery survives AOF rewrite} { + r flushdb + set longstr [string repeat x 100] + set values [list \ + 0 "1.0" \ + 1 "-1.0" \ + 2 "42.0" \ + 3 "hello" \ + 4 "12345" \ + 5 "-0.0" \ + 6 "0.00" \ + 7 "10.500" \ + 8 "001.25" \ + 9 "1.0e-10" \ + 10 "1.0e+100" \ + 11 $longstr \ + 12 ""] + + foreach {idx val} $values { + r arset myarray $idx $val + } + + foreach {idx val} $values { + assert_equal $val [r arget myarray $idx] + } + + # Trigger AOF rewrite and reload + r bgrewriteaof + waitForBgrewriteaof r + r debug loadaof + + foreach {idx val} $values { + assert_equal $val [r arget myarray $idx] + } + } {} {needs:debug} + + test {Regression - CONFIG GET/SET for array settings} { + # Verify config options exist and are readable + set slice_size [lindex [r config get array-slice-size] 1] + set sparse_kmax [lindex [r config get array-sparse-kmax] 1] + set sparse_kmin [lindex [r config get array-sparse-kmin] 1] + + # Verify defaults + assert_equal 4096 $slice_size + assert_equal 10 $sparse_kmax + assert_equal 5 $sparse_kmin + + # sparse-kmax and sparse-kmin should be modifiable + r config set array-sparse-kmax 20 + assert_equal 20 [lindex [r config get array-sparse-kmax] 1] + r config set array-sparse-kmax $sparse_kmax ;# restore + + r config set array-sparse-kmin 8 + assert_equal 8 [lindex [r config get array-sparse-kmin] 1] + r config set array-sparse-kmin $sparse_kmin ;# restore + + # slice-size is modifiable but 
must be a power of two + r config set array-slice-size 8192 + assert_equal 8192 [lindex [r config get array-slice-size] 1] + r config set array-slice-size $slice_size ;# restore + + # Non-power-of-two should error + assert_error {*power of two*} {r config set array-slice-size 5000} + } + + test {Arrays created with different slice sizes work after config change} { + # Create an array with current slice size + r del myarray + set orig_size [lindex [r config get array-slice-size] 1] + + # Create array and populate it + for {set i 0} {$i < 10000} {incr i 1000} { + r arset myarray $i "value_$i" + } + set orig_count [r arcount myarray] + + # Change slice size - existing arrays should keep working + r config set array-slice-size 8192 + + # Verify old array still works + assert_equal $orig_count [r arcount myarray] + assert_equal "value_0" [r arget myarray 0] + assert_equal "value_5000" [r arget myarray 5000] + assert_equal "value_9000" [r arget myarray 9000] + + # Create new array with new slice size + r del newarray + r arset newarray 0 "new_value" + assert_equal "new_value" [r arget newarray 0] + + # Restore config + r config set array-slice-size $orig_size + r del myarray + r del newarray + } + + test {Regression - AOF rewrite with superdir mode (high indices)} { + # This tests the fix for AOF rewrite not iterating superdir blocks. + # With slice_size=4096, slice_id 2048 starts at index 8388608. + # Indices >= 8388608 trigger superdir mode. 
+ + r del aoftest + + # Create array with elements that trigger superdir mode + r arset aoftest 0 base + r arset aoftest 8388608 triggers_superdir + r arset aoftest 50000000 high + r arset aoftest 100000000 very_high + + assert_equal 4 [r arcount aoftest] + + # Verify superdir mode is active (directory-size shows number of blocks) + set info [r arinfo aoftest] + set dir_size [dict get $info directory-size] + # NOTE(review): these indices fall in several superdir blocks, so dir_size is expected to be > 1, but the assertion below only requires >= 1 - confirm which bound is intended + assert {$dir_size >= 1} + + # Trigger AOF rewrite and reload (same pattern as other AOF tests) + r bgrewriteaof + waitForBgrewriteaof r + r debug loadaof + + # Verify data survived AOF rewrite and reload + assert_equal 4 [r arcount aoftest] + assert_equal "base" [r arget aoftest 0] + assert_equal "triggers_superdir" [r arget aoftest 8388608] + assert_equal "high" [r arget aoftest 50000000] + assert_equal "very_high" [r arget aoftest 100000000] + + assert_equal 1 [r del aoftest] + } {} {needs:debug} + + # ========================================================================= + # Superdir command coverage + # ========================================================================= + + test {ARGETRANGE works across a superdir slice boundary} { + r del myarray + + # Cross slice 2047 -> 2048. Inserting the high index forces the array + # into superdir mode, but the range itself is still short. + r arset myarray 8388607 "left" + r arset myarray 8388608 "mid" + r arset myarray 8388609 "right" + + assert_equal {left mid right} [r argetrange myarray 8388607 8388609] + assert_equal {right mid left} [r argetrange myarray 8388609 8388607] + } + + test {ARSET pre-promotes sparse slice in superdir mode} { + r del myarray + set kmax [lindex [r config get array-sparse-kmax] 1] + assert {$kmax >= 4} + + # Build a sparse slice with kmax-1 existing elements at even offsets. + # The later range write covers offsets 0..kmax-1, so some of these + # positions are already filled and some are new.
+ for {set i 0} {$i < $kmax - 1} {incr i} { + set off [expr {$i * 2}] + r arset myarray [expr {8388608 + $off}] "old$off" + } + + set info [r arinfo myarray FULL] + assert_equal 0 [dict get $info dense-slices] + assert_equal 1 [dict get $info sparse-slices] + + # The range has kmax slots, while the slice already contains kmax-1 + # elements spread across the slice. This keeps range_size <= kmax, so + # the helper must take the count+new_elements path in order to decide + # the promotion. + set values {} + set existing_in_range 0 + for {set off 0} {$off < $kmax} {incr off} { + lappend values "n$off" + if {$off % 2 == 0 && $off <= 2 * ($kmax - 2)} { + incr existing_in_range + } + } + set expected_new [expr {$kmax - $existing_in_range}] + assert_equal $expected_new [r arset myarray 8388608 {*}$values] + + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert_equal 0 [dict get $info sparse-slices] + assert_equal $values [r argetrange myarray 8388608 [expr {8388608 + $kmax - 1}]] + assert_equal "old[expr {2 * ($kmax - 2)}]" [r arget myarray [expr {8388608 + 2 * ($kmax - 2)}]] + } + + # ========================================================================= + # Range delete + iterator tests (dense→sparse demotion, superdir, sparse) + # ========================================================================= + + test {ARDELRANGE triggers dense to sparse demotion} { + r del myarray + # Pin config to ensure test doesn't break if defaults change + set orig_kmin [lindex [r config get array-sparse-kmin] 1] + r config set array-sparse-kmin 5 + + # Create a dense slice with 50 elements + for {set i 0} {$i < 50} {incr i} { + r arset myarray $i "val$i" + } + assert_equal 50 [r arcount myarray] + + # Verify it's dense + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + assert_equal 0 [dict get $info sparse-slices] + + # Delete most elements with ARDELRANGE, leaving only 3 (below kmin=5) + assert_equal 47 [r 
ardelrange myarray 3 49] + assert_equal 3 [r arcount myarray] + + # Verify demotion to sparse + set info [r arinfo myarray FULL] + assert_equal 0 [dict get $info dense-slices] + assert_equal 1 [dict get $info sparse-slices] + + # Verify remaining elements + assert_equal "val0" [r arget myarray 0] + assert_equal "val1" [r arget myarray 1] + assert_equal "val2" [r arget myarray 2] + + r config set array-sparse-kmin $orig_kmin + } + + test {ARDELRANGE partial delete preserves dense then demotes} { + r del myarray + # Pin config + set orig_kmin [lindex [r config get array-sparse-kmin] 1] + r config set array-sparse-kmin 5 + + # Create dense slice + for {set i 0} {$i < 40} {incr i} { + r arset myarray $i $i + } + + # Delete some but not enough to trigger demotion (keep 10 > kmin=5) + assert_equal 30 [r ardelrange myarray 10 39] + assert_equal 10 [r arcount myarray] + + set info [r arinfo myarray FULL] + assert_equal 1 [dict get $info dense-slices] + + # Now delete more to trigger demotion + assert_equal 6 [r ardelrange myarray 4 9] + assert_equal 4 [r arcount myarray] + + set info [r arinfo myarray FULL] + assert_equal 0 [dict get $info dense-slices] + assert_equal 1 [dict get $info sparse-slices] + + r config set array-sparse-kmin $orig_kmin + } + + test {ARDELRANGE deletes full slices within superdir block} { + r del myarray + # With slice_size=4096: + # - Slice 2048 starts at index 8388608 + # - Slice 2049 starts at index 8392704 + # - Both are in superdir block 1 + + # Create elements in two adjacent slices within same superdir block + r arset myarray 8388608 "slice2048_a" + r arset myarray 8388700 "slice2048_b" + r arset myarray 8392704 "slice2049_a" + r arset myarray 8392800 "slice2049_b" + # And one element in a different block for reference + r arset myarray 0 "slice0" + + assert_equal 5 [r arcount myarray] + + # Delete range that fully covers both slices 2048 and 2049 + # This should trigger full-slice deletion (not element-by-element) + assert_equal 4 [r 
ardelrange myarray 8388608 8396799] + assert_equal 1 [r arcount myarray] + + # Verify only slice0 element remains + assert_equal "slice0" [r arget myarray 0] + assert_equal {} [r arget myarray 8388608] + assert_equal {} [r arget myarray 8392704] + + r del myarray + } + + test {ARDELRANGE spanning multiple superdir blocks} { + r del myarray + # Superdir block boundaries with slice_size=4096: + # - Block 0: slices 0-2047 (indices 0 - 8388607) + # - Block 1: slices 2048-4095 (indices 8388608 - 16777215) + # - Block 2: slices 4096+ (indices 16777216+) + + # Create elements across three blocks + r arset myarray 100 "block0" + r arset myarray 8388608 "block1_start" + r arset myarray 12000000 "block1_mid" + r arset myarray 16777200 "block1_end" + r arset myarray 16777216 "block2_start" + r arset myarray 20000000 "block2_mid" + + assert_equal 6 [r arcount myarray] + + # Delete range spanning from block1 into block2 + # This exercises cross-block deletion + assert_equal 4 [r ardelrange myarray 8388608 18000000] + assert_equal 2 [r arcount myarray] + + # Verify block0 and remaining block2 element + assert_equal "block0" [r arget myarray 100] + assert_equal "block2_mid" [r arget myarray 20000000] + assert_equal {} [r arget myarray 8388608] + assert_equal {} [r arget myarray 16777216] + + r del myarray + } + + test {ARDELRANGE superdir middle range with missing upper block} { + r del myarray + # Occupied blocks: + # - block 0: boundary lo_slice + # - block 1: middle full slices to delete + # - block 3: boundary hi_slice + # block 2 is intentionally empty, so the upper lower-bound search + # must stop at the insertion point rather than on an exact match. 
+ r arset myarray 8388590 "block0_keep" + r arset myarray 8388608 "block1_a" + r arset myarray 8392704 "block1_b" + r arset myarray 25165825 "block3_keep" + + assert_equal 4 [r arcount myarray] + assert_equal 2 [r ardelrange myarray 8388595 25165824] + assert_equal 2 [r arcount myarray] + + assert_equal "block0_keep" [r arget myarray 8388590] + assert_equal {} [r arget myarray 8388608] + assert_equal {} [r arget myarray 8392704] + assert_equal "block3_keep" [r arget myarray 25165825] + } + + test {ARDELRANGE superdir with empty middle block interval} { + r del myarray + # Only the boundary slices are populated. The superdir middle interval + # is empty, so the block loop must resolve to [start, end) = empty. + r arset myarray 8388590 "block0_keep" + r arset myarray 8388607 "block0_del" + r arset myarray 25165824 "block3_del" + r arset myarray 25165825 "block3_keep" + + assert_equal 4 [r arcount myarray] + assert_equal 2 [r ardelrange myarray 8388600 25165824] + assert_equal 2 [r arcount myarray] + + assert_equal "block0_keep" [r arget myarray 8388590] + assert_equal {} [r arget myarray 8388607] + assert_equal {} [r arget myarray 25165824] + assert_equal "block3_keep" [r arget myarray 25165825] + } + + test {ARDELRANGE with multiple ranges in single call} { + r del myarray + for {set i 0} {$i < 20} {incr i} { + r arset myarray $i "val$i" + } + assert_equal 20 [r arcount myarray] + + # Delete two separate ranges in one command + # Ranges: [2,4] and [10,14] + assert_equal 8 [r ardelrange myarray 2 4 10 14] + assert_equal 12 [r arcount myarray] + + # Verify correct elements deleted + assert_equal "val0" [r arget myarray 0] + assert_equal "val1" [r arget myarray 1] + assert_equal {} [r arget myarray 2] + assert_equal {} [r arget myarray 3] + assert_equal {} [r arget myarray 4] + assert_equal "val5" [r arget myarray 5] + assert_equal "val9" [r arget myarray 9] + assert_equal {} [r arget myarray 10] + assert_equal {} [r arget myarray 14] + assert_equal "val15" [r arget 
myarray 15] + } + + test {ARDELRANGE with overlapping ranges} { + r del myarray + for {set i 0} {$i < 20} {incr i} { + r arset myarray $i "val$i" + } + + # Overlapping ranges: [5,12] and [8,15] + # Should delete [5,15] total = 11 elements + # But second range re-deletes already-deleted [8,12], so still 11 unique + assert_equal 11 [r ardelrange myarray 5 12 8 15] + assert_equal 9 [r arcount myarray] + + assert_equal "val4" [r arget myarray 4] + assert_equal {} [r arget myarray 5] + assert_equal {} [r arget myarray 12] + assert_equal {} [r arget myarray 15] + assert_equal "val16" [r arget myarray 16] + } + + test {ARDELRANGE sparse slice middle-span deletion} { + r del myarray + # Create sparse slice with specific offsets + r arset myarray 10 "a" + r arset myarray 20 "b" + r arset myarray 30 "c" + r arset myarray 40 "d" + r arset myarray 50 "e" + + assert_equal 5 [r arcount myarray] + + # Delete a middle contiguous sparse span. + assert_equal 3 [r ardelrange myarray 20 40] + assert_equal 2 [r arcount myarray] + + # Verify correct elements remain + assert_equal "a" [r arget myarray 10] + assert_equal {} [r arget myarray 20] + assert_equal {} [r arget myarray 30] + assert_equal {} [r arget myarray 40] + assert_equal "e" [r arget myarray 50] + } + + test {ARDELRANGE sparse with non-contiguous deletions} { + r del myarray + # Sparse elements at various offsets + r arset myarray 5 "v5" + r arset myarray 15 "v15" + r arset myarray 25 "v25" + r arset myarray 35 "v35" + r arset myarray 45 "v45" + + # Delete range that only hits some elements + assert_equal 2 [r ardelrange myarray 10 30] + assert_equal 3 [r arcount myarray] + + assert_equal "v5" [r arget myarray 5] + assert_equal {} [r arget myarray 15] + assert_equal {} [r arget myarray 25] + assert_equal "v35" [r arget myarray 35] + assert_equal "v45" [r arget myarray 45] + } + + test {ARDELRANGE sparse prefix span deletion} { + r del myarray + r arset myarray 10 "a" + r arset myarray 20 "b" + r arset myarray 30 "c" + r 
arset myarray 40 "d" + r arset myarray 50 "e" + + # Delete the sparse prefix span: first == 0, last in the middle. + assert_equal 2 [r ardelrange myarray 0 25] + assert_equal 3 [r arcount myarray] + + assert_equal {} [r arget myarray 10] + assert_equal {} [r arget myarray 20] + assert_equal "c" [r arget myarray 30] + assert_equal "d" [r arget myarray 40] + assert_equal "e" [r arget myarray 50] + } + + test {ARDELRANGE sparse suffix span deletion} { + r del myarray + r arset myarray 10 "a" + r arset myarray 20 "b" + r arset myarray 30 "c" + r arset myarray 40 "d" + r arset myarray 50 "e" + + # Delete the sparse suffix span: first in the middle, last == count. + assert_equal 2 [r ardelrange myarray 35 100] + assert_equal 3 [r arcount myarray] + + assert_equal "a" [r arget myarray 10] + assert_equal "b" [r arget myarray 20] + assert_equal "c" [r arget myarray 30] + assert_equal {} [r arget myarray 40] + assert_equal {} [r arget myarray 50] + } + + test {ARDELRANGE sparse whole-slice deletion} { + r del myarray + r arset myarray 10 "a" + r arset myarray 20 "b" + r arset myarray 30 "c" + r arset myarray 40 "d" + r arset myarray 50 "e" + + # Delete the whole sparse slice: first == 0, last == count. + assert_equal 5 [r ardelrange myarray 0 100] + assert_equal 0 [r exists myarray] + } + + test {ARDELRANGE sparse no-hit range} { + r del myarray + r arset myarray 10 "a" + r arset myarray 20 "b" + r arset myarray 30 "c" + r arset myarray 40 "d" + r arset myarray 50 "e" + + # Delete a range that falls strictly between two sparse offsets. 
+ assert_equal 0 [r ardelrange myarray 11 19] + assert_equal 5 [r arcount myarray] + + assert_equal "a" [r arget myarray 10] + assert_equal "b" [r arget myarray 20] + assert_equal "c" [r arget myarray 30] + assert_equal "d" [r arget myarray 40] + assert_equal "e" [r arget myarray 50] + } + + test {ARDELRANGE sparse single edge deletions} { + r del myarray + r arset myarray 10 "a" + r arset myarray 20 "b" + r arset myarray 30 "c" + r arset myarray 40 "d" + r arset myarray 50 "e" + + # Delete exactly the first sparse element, then exactly the last one. + assert_equal 1 [r ardelrange myarray 10 10] + assert_equal 4 [r arcount myarray] + assert_equal {} [r arget myarray 10] + assert_equal "b" [r arget myarray 20] + assert_equal "c" [r arget myarray 30] + assert_equal "d" [r arget myarray 40] + assert_equal "e" [r arget myarray 50] + + assert_equal 1 [r ardelrange myarray 50 50] + assert_equal 3 [r arcount myarray] + assert_equal "b" [r arget myarray 20] + assert_equal "c" [r arget myarray 30] + assert_equal "d" [r arget myarray 40] + assert_equal {} [r arget myarray 50] + } + + test {Random testing - blackbox ARDELRANGE model stress} { + r flushdb + expr {srand(24680)} + array set model_state {} + + for {set step 0} {$step < 400} {incr step} { + set roll [expr {int(rand() * 100)}] + + if {$roll < 50} { + set idx [random_array_index] + set val [random_value] + r arset myarray $idx $val + set model_state($idx) $val + } elseif {$roll < 70} { + set idx [random_array_index] + set expected_deleted 0 + if {[info exists model_state($idx)]} { + unset model_state($idx) + set expected_deleted 1 + } + assert_equal $expected_deleted [r ardel myarray $idx] + } else { + set args {} + set expected_deleted 0 + set nranges [expr {int(rand() * 3) + 1}] + + for {set i 0} {$i < $nranges} {incr i} { + set lo [random_array_index] + set hi [random_array_index] + lappend args $lo $hi + incr expected_deleted [model_array_delrange model_state $lo $hi] + } + + assert_equal $expected_deleted [r 
ardelrange myarray {*}$args] + } + + if {$step % 25 == 0 || $step == 399} { + set expected_scan [model_array_scan model_state] + set expected_count [array size model_state] + + if {$expected_count == 0} { + assert_equal 0 [r exists myarray] + assert_equal {} [r arscan myarray 0 30000000] + } else { + assert_equal $expected_count [r arcount myarray] + assert_equal $expected_scan [r arscan myarray 0 30000000] + } + + for {set probe 0} {$probe < 20} {incr probe} { + set idx [random_array_index] + if {[info exists model_state($idx)]} { + assert_equal $model_state($idx) [r arget myarray $idx] + } else { + assert_equal {} [r arget myarray $idx] + } + } + } + } + } + + test {ARSCAN after ARDELRANGE with demotion} { + r del myarray + # Create dense + for {set i 0} {$i < 30} {incr i} { + r arset myarray $i "val$i" + } + + # Delete most, triggering demotion + r ardelrange myarray 4 29 + + # ARSCAN should find remaining elements + set result [r arscan myarray 0 100] + assert_equal 4 [llength $result] + assert_equal {{0 val0} {1 val1} {2 val2} {3 val3}} $result + + # Reverse scan + set result [r arscan myarray 100 0] + assert_equal {{3 val3} {2 val2} {1 val1} {0 val0}} $result + } + + test {ARSCAN with LIMIT after range delete} { + r del myarray + for {set i 0} {$i < 20} {incr i} { + r arset myarray $i $i + } + + # Delete some in the middle + r ardelrange myarray 5 14 + + # Scan with limit + set result [r arscan myarray 0 100 LIMIT 3] + assert_equal 3 [llength $result] + assert_equal {{0 0} {1 1} {2 2}} $result + } + + test {AROP after ARDELRANGE across multiple slices} { + r del myarray + # Create elements across slice boundaries (slice_size=4096) + for {set i 0} {$i < 10} {incr i} { + r arset myarray $i $i + } + for {set i 4096} {$i < 4106} {incr i} { + r arset myarray $i $i + } + + assert_equal 20 [r arcount myarray] + + # Delete first slice partially + r ardelrange myarray 5 9 + + # AROP SUM should work across slices + # Remaining: 0+1+2+3+4 + 4096..4105 = 10 + 
sum(4096..4105) + # sum(4096..4105) = (4096+4105)*10/2 = 41005 + set sum [r arop myarray 0 5000 SUM] + assert_equal 41015 $sum + + # AROP USED + assert_equal 15 [r arop myarray 0 5000 USED] + + # AROP MIN/MAX + assert_equal 0 [r arop myarray 0 5000 MIN] + assert_equal 4105 [r arop myarray 0 5000 MAX] + } + + test {AROP MATCH after dense demotion} { + r del myarray + # Create dense with repeated values + for {set i 0} {$i < 30} {incr i} { + r arset myarray $i "target" + } + r arset myarray 2 "other" + + # Delete most to trigger demotion, keep indices 0-3 + # After delete: 0=target, 1=target, 2=other, 3=target + r ardelrange myarray 4 29 + + # Verify demotion happened + set info [r arinfo myarray FULL] + assert_equal 0 [dict get $info dense-slices] + assert_equal 1 [dict get $info sparse-slices] + + # Count matches in sparse slice (3 "target" values) + assert_equal 3 [r arop myarray 0 100 MATCH target] + } + + test {ARSCAN over superdir blocks} { + r del myarray + # Elements in different superdir blocks + r arset myarray 0 "first" + r arset myarray 8388608 "second" + r arset myarray 16777216 "third" + + # Scan entire range + set result [r arscan myarray 0 20000000] + assert_equal 3 [llength $result] + assert_equal {0 first} [lindex $result 0] + assert_equal {8388608 second} [lindex $result 1] + assert_equal {16777216 third} [lindex $result 2] + + # Reverse scan + set result [r arscan myarray 20000000 0] + assert_equal {16777216 third} [lindex $result 0] + assert_equal {8388608 second} [lindex $result 1] + assert_equal {0 first} [lindex $result 2] + + r del myarray + } + + test {Iterator commands do not rescan exhausted superdir blocks} { + r del myarray + r arset myarray 43 "a" + r arset myarray 4586 "b" + r arset myarray 19245258 "c" + + assert_equal {{43 a} {4586 b} {19245258 c}} \ + [r arscan myarray 0 30000000 LIMIT 8] + assert_equal {{19245258 c}} \ + [r argrep myarray 0 30000000 EXACT c WITHVALUES LIMIT 4] + assert_equal 3 [r arop myarray 0 30000000 USED] + } + 
+ test {AROP over superdir with partial range} { + r del myarray + r arset myarray 0 10 + r arset myarray 100 20 + r arset myarray 8388608 30 + r arset myarray 8388700 40 + r arset myarray 16777216 50 + + # SUM only in first block + assert_equal 30 [r arop myarray 0 1000 SUM] + + # SUM spanning blocks + assert_equal 150 [r arop myarray 0 20000000 SUM] + + # USED in specific range + assert_equal 2 [r arop myarray 8388600 8388800 USED] + + r del myarray + } + + test {ARDELRANGE delete entire slice then verify iteration} { + r del myarray + # Two slices + for {set i 0} {$i < 10} {incr i} { + r arset myarray $i "slice0_$i" + } + for {set i 4096} {$i < 4106} {incr i} { + r arset myarray $i "slice1_$i" + } + + # Delete entire first slice + assert_equal 10 [r ardelrange myarray 0 4095] + assert_equal 10 [r arcount myarray] + + # ARSCAN should only find second slice elements + set result [r arscan myarray 0 5000] + assert_equal 10 [llength $result] + assert_equal {4096 slice1_4096} [lindex $result 0] + } + +} + +# Test loading a 32-bit generated RDB on the current architecture. +# The RDB file contains arrays exercising all tagged pointer encodings: +# immediate ints (including 30-bit boundary values), inline floats, +# small strings, arString heap strings, mixed types, sparse indices, +# and insert_idx preservation. 
+set server_path [tmpdir "server.array-32bit-rdb-test"] +exec cp [file join [pwd] tests/assets/array-32bit.rdb] $server_path + +start_server [list overrides [list "dir" $server_path "dbfilename" "array-32bit.rdb"] tags {"array external:skip"}] { + + test {Load 32-bit RDB - integer encodings} { + r select 0 + # Inline ints and boundary values + assert_equal 0 [r arget ints 0] + assert_equal 1 [r arget ints 1] + assert_equal -1 [r arget ints 2] + assert_equal 42 [r arget ints 3] + assert_equal -42 [r arget ints 4] + # 30-bit int boundary (max/min for 32-bit tagged ints) + assert_equal 536870911 [r arget ints 5] + assert_equal -536870912 [r arget ints 6] + # Values beyond 30-bit range (arString on 32-bit, re-encoded on load) + assert_equal 536870912 [r arget ints 7] + assert_equal -536870913 [r arget ints 8] + assert_equal 2147483647 [r arget ints 9] + assert_equal -2147483648 [r arget ints 10] + assert_equal 1000000000 [r arget ints 11] + assert_equal 999999999 [r arget ints 12] + assert_equal 100 [r arget ints 13] + assert_equal 14 [r arcount ints] + } + + test {Load 32-bit RDB - float encodings} { + r select 0 + assert_equal 1.0 [r arget floats 0] + assert_equal -1.0 [r arget floats 1] + assert_equal 3.14 [r arget floats 2] + assert_equal 0.5 [r arget floats 3] + assert_equal -0.5 [r arget floats 4] + assert_equal 0.25 [r arget floats 5] + assert_equal 100.0 [r arget floats 6] + assert_equal -100.0 [r arget floats 7] + assert_equal 1.5 [r arget floats 8] + assert_equal 1.75 [r arget floats 9] + assert_equal 0.1 [r arget floats 10] + assert_equal 1234.5 [r arget floats 11] + assert_equal 0.0625 [r arget floats 12] + assert_equal 999999.0 [r arget floats 13] + assert_equal 1.23456789012 [r arget floats 14] + assert_equal 15 [r arcount floats] + } + + test {Load 32-bit RDB - string encodings} { + r select 0 + # Empty string, 1-3 byte inline (smallstr on 32-bit), + # 4-7 byte (smallstr on 64-bit only, arString on 32-bit), + # 8+ byte (always arString) + assert_equal {} 
[r arget strs 0] + assert_equal a [r arget strs 1] + assert_equal ab [r arget strs 2] + assert_equal abc [r arget strs 3] + assert_equal abcd [r arget strs 4] + assert_equal abcde [r arget strs 5] + assert_equal abcdef [r arget strs 6] + assert_equal abcdefg [r arget strs 7] + assert_equal abcdefgh [r arget strs 8] + assert_equal {hello world} [r arget strs 9] + assert_equal {this is a longer string for testing} [r arget strs 10] + assert_equal x [r arget strs 11] + assert_equal xy [r arget strs 12] + assert_equal xyz [r arget strs 13] + assert_equal 14 [r arcount strs] + } + + test {Load 32-bit RDB - mixed type encodings} { + r select 0 + assert_equal 42 [r arget mixed 0] + assert_equal 3.14 [r arget mixed 1] + assert_equal hi [r arget mixed 2] + assert_equal -536870912 [r arget mixed 3] + assert_equal 0.5 [r arget mixed 4] + assert_equal abcdefghij [r arget mixed 5] + assert_equal 536870911 [r arget mixed 6] + assert_equal -1.5 [r arget mixed 7] + assert_equal ab [r arget mixed 8] + assert_equal 0 [r arget mixed 9] + assert_equal 1.0 [r arget mixed 10] + assert_equal hello [r arget mixed 11] + assert_equal 2147483647 [r arget mixed 12] + assert_equal 0.25 [r arget mixed 13] + assert_equal xyz [r arget mixed 14] + assert_equal 15 [r arcount mixed] + } + + test {Load 32-bit RDB - sparse indices across slices} { + r select 0 + assert_equal first [r arget sparse 0] + assert_equal slice0end [r arget sparse 4095] + assert_equal slice1start [r arget sparse 4096] + assert_equal slice1end [r arget sparse 8191] + assert_equal 42 [r arget sparse 10000] + assert_equal 3.14 [r arget sparse 50000] + assert_equal hello [r arget sparse 100000] + assert_equal 7 [r arcount sparse] + } + + test {Load 32-bit RDB - insert_idx preservation} { + r select 0 + assert_equal one [r arget withinsert 0] + assert_equal two [r arget withinsert 1] + assert_equal three [r arget withinsert 2] + assert_equal four [r arget withinsert 3] + assert_equal five [r arget withinsert 4] + assert_equal 5 [r 
arcount withinsert] + # Verify insert_idx was preserved: next insert should go at index 5 + r arinsert withinsert six + assert_equal six [r arget withinsert 5] + } + + test {Load 32-bit RDB - re-save and reload cycle} { + r select 0 + # Save from 64-bit, reload, verify integrity + r save + r debug reload + foreach {idx value} { + 0 0 1 1 2 -1 3 42 4 -42 + 5 536870911 6 -536870912 7 536870912 8 -536870913 + 9 2147483647 10 -2147483648 11 1000000000 12 999999999 13 100 + } { + assert_equal $value [r arget ints $idx] + } + assert_equal 14 [r arcount ints] + + foreach {idx value} { + 0 1.0 1 -1.0 2 3.14 3 0.5 4 -0.5 + 5 0.25 6 100.0 7 -100.0 8 1.5 9 1.75 + 10 0.1 11 1234.5 12 0.0625 13 999999.0 14 1.23456789012 + } { + assert_equal $value [r arget floats $idx] + } + assert_equal 15 [r arcount floats] + + foreach {idx value} { + 0 {} 1 a 2 ab 3 abc 4 abcd 5 abcde 6 abcdef 7 abcdefg + 8 abcdefgh 9 {hello world} 10 {this is a longer string for testing} + 11 x 12 xy 13 xyz + } { + assert_equal $value [r arget strs $idx] + } + assert_equal 14 [r arcount strs] + + foreach {idx value} { + 0 42 1 3.14 2 hi 3 -536870912 4 0.5 + 5 abcdefghij 6 536870911 7 -1.5 8 ab 9 0 + 10 1.0 11 hello 12 2147483647 13 0.25 14 xyz + } { + assert_equal $value [r arget mixed $idx] + } + assert_equal 15 [r arcount mixed] + + foreach {idx value} { + 0 first 4095 slice0end 4096 slice1start 8191 slice1end + 10000 42 50000 3.14 100000 hello + } { + assert_equal $value [r arget sparse $idx] + } + assert_equal 7 [r arcount sparse] + + foreach {idx value} { + 0 one 1 two 2 three 3 four 4 five 5 six + } { + assert_equal $value [r arget withinsert $idx] + } + assert_equal 6 [r arcount withinsert] + r arinsert withinsert seven + assert_equal seven [r arget withinsert 6] + } {} {needs:debug} +} diff --git a/tools/array-bench.py b/tools/array-bench.py new file mode 100755 index 000000000..959e12961 --- /dev/null +++ b/tools/array-bench.py @@ -0,0 +1,431 @@ +#!/usr/bin/env python3 +import argparse +import json 
+import os +import re +import signal +import subprocess +import sys +import tempfile +import time +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Optional + + +QPS_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)\s+requests per second") + + +@dataclass +class Workload: + name: str + description: str + command: list[str] + requests: int + clients: int + pipeline: int + rand_range: int = 0 + warmup_requests: int = 2000 + setup: Optional[str] = None + + +@dataclass +class Result: + name: str + description: str + qps: float + requests: int + clients: int + pipeline: int + rand_range: int + command: list[str] + raw_output: str + + +class BenchError(RuntimeError): + pass + + +class RedisArrayBench: + def __init__(self, args: argparse.Namespace): + self.args = args + self.base_dir = Path(__file__).resolve().parent + repo_root = self.base_dir.parent + src_dir = Path(args.src_dir) if args.src_dir else repo_root / "src" + self.redis_server = str(src_dir / "redis-server") + self.redis_cli = str(src_dir / "redis-cli") + self.redis_benchmark = str(src_dir / "redis-benchmark") + self.server_proc: Optional[subprocess.Popen[str]] = None + self.server_dir: Optional[tempfile.TemporaryDirectory[str]] = None + self.host = args.host + self.port = args.port + self.db = args.db + self.results: list[Result] = [] + + for binary in (self.redis_server, self.redis_cli, self.redis_benchmark): + if not os.path.exists(binary): + raise BenchError(f"missing binary: {binary}") + + def run(self) -> int: + try: + if self.args.start_server: + self.start_server() + self.prepare_data() + self.print_dataset_summary() + for workload in self.selected_workloads(): + result = self.run_workload(workload) + self.results.append(result) + print(f"{result.name:28s} {result.qps:12.2f} req/s") + self.print_summary() + if self.args.json_out: + with open(self.args.json_out, "w", encoding="utf-8") as fp: + json.dump({ + "host": self.host, + "port": self.port, + "db": self.db, + 
"results": [asdict(r) for r in self.results], + }, fp, indent=2) + print(f"json written to {self.args.json_out}") + return 0 + finally: + if self.args.start_server and not self.args.keep_server: + self.stop_server() + + def start_server(self) -> None: + self.server_dir = tempfile.TemporaryDirectory(prefix="array-bench-") + cmd = [ + self.redis_server, + "--port", str(self.port), + "--save", "", + "--appendonly", "no", + "--dir", self.server_dir.name, + "--loglevel", "warning", + "--daemonize", "no", + ] + self.server_proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + self.wait_for_ping(timeout=10.0) + + def stop_server(self) -> None: + if self.server_proc is not None and self.server_proc.poll() is None: + self.server_proc.send_signal(signal.SIGTERM) + try: + self.server_proc.wait(timeout=5) + except subprocess.TimeoutExpired: + self.server_proc.kill() + self.server_proc.wait(timeout=5) + if self.server_dir is not None: + self.server_dir.cleanup() + self.server_proc = None + self.server_dir = None + + def wait_for_ping(self, timeout: float) -> None: + deadline = time.time() + timeout + last_error = None + while time.time() < deadline: + if self.server_proc is not None and self.server_proc.poll() is not None: + raise BenchError( + "server exited before becoming ready:\n" + f"{self.read_server_output().strip()}" + ) + try: + cmd = [ + self.redis_cli, + "-h", self.host, + "-p", str(self.port), + "-n", str(self.db), + "--raw", + "PING", + ] + probe = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if probe.returncode != 0: + raise BenchError(probe.stderr.strip() or probe.stdout.strip()) + out = probe.stdout.strip() + if out == "PONG": + return + except Exception as exc: # pragma: no cover - startup race handling + last_error = exc + time.sleep(0.05) + raise BenchError( + f"server did not start on {self.host}:{self.port}: {last_error}\n" + 
f"{self.read_server_output().strip()}" + ) + + def read_server_output(self) -> str: + if self.server_proc is None or self.server_proc.stdout is None: + return "" + try: + return self.server_proc.stdout.read() + except Exception: # pragma: no cover - best effort diagnostics + return "" + + def cli(self, command: list[str], raw: bool = False) -> str: + cmd = [self.redis_cli, "-h", self.host, "-p", str(self.port), "-n", str(self.db)] + if raw: + cmd.append("--raw") + cmd.extend(command) + return subprocess.check_output(cmd, text=True) + + def pipe(self, payload: bytes) -> None: + cmd = [self.redis_cli, "-h", self.host, "-p", str(self.port), "-n", str(self.db), "--pipe"] + proc = subprocess.run(cmd, input=payload, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + if proc.returncode != 0: + raise BenchError(f"redis-cli --pipe failed:\n{proc.stdout.decode('utf-8', 'replace')}") + out = proc.stdout.decode("utf-8", "replace") + if "errors: 0, replies:" not in out: + raise BenchError(f"unexpected --pipe output:\n{out}") + + @staticmethod + def resp(parts: list[str]) -> bytes: + out = [f"*{len(parts)}\r\n".encode()] + for part in parts: + data = part.encode("utf-8") + out.append(f"${len(data)}\r\n".encode()) + out.append(data) + out.append(b"\r\n") + return b"".join(out) + + def prepare_data(self) -> None: + print("preparing datasets...", file=sys.stderr) + self.cli(["FLUSHDB"]) + payload = bytearray() + payload += self.resp(["DEL", "bench:array:dense:num", "bench:array:dense:text", "bench:array:sparse:text", "bench:array:append", "bench:array:ring"]) + payload += self.build_dense_numeric() + payload += self.build_dense_text() + payload += self.build_sparse_text() + self.pipe(bytes(payload)) + + def build_dense_numeric(self) -> bytes: + key = "bench:array:dense:num" + total = self.args.dense_len + batch = 256 + payload = bytearray() + for start in range(0, total, batch): + values = [str(start + i) for i in range(min(batch, total - start))] + payload += self.resp(["ARSET", 
key, str(start), *values]) + return bytes(payload) + + def build_dense_text(self) -> bytes: + key = "bench:array:dense:text" + total = self.args.dense_len + batch = 128 + payload = bytearray() + for start in range(0, total, batch): + values = [] + for i in range(start, min(start + batch, total)): + mod = i % 4 + if mod == 0: + values.append(f"row:{i} alpha encoding complexity") + elif mod == 1: + values.append(f"row:{i} beta sparse vector") + elif mod == 2: + values.append(f"row:{i} gamma dense matcher") + else: + values.append(f"row:{i} delta encoding helper") + payload += self.resp(["ARSET", key, str(start), *values]) + return bytes(payload) + + def build_sparse_text(self) -> bytes: + key = "bench:array:sparse:text" + clusters = [ + (0, 97, 384), + (8_388_608, 113, 640), + (16_777_216, 127, 896), + (25_165_824, 151, 896), + ] + batch_pairs = 64 + pairs: list[str] = [] + payload = bytearray() + nth = 0 + for base, stride, count in clusters: + for i in range(count): + idx = base + i * stride + mod = nth % 4 + if mod == 0: + value = f"slot:{idx} alpha encoding complexity" + elif mod == 1: + value = f"slot:{idx} beta sparse needle" + elif mod == 2: + value = f"slot:{idx} gamma dense helper" + else: + value = f"slot:{idx} delta complexity marker" + pairs.extend([str(idx), value]) + nth += 1 + if len(pairs) >= batch_pairs * 2: + payload += self.resp(["ARMSET", key, *pairs]) + pairs.clear() + if pairs: + payload += self.resp(["ARMSET", key, *pairs]) + return bytes(payload) + + def print_dataset_summary(self) -> None: + summary = { + "bench:array:dense:num": { + "count": self.cli(["ARCOUNT", "bench:array:dense:num"], raw=True).strip(), + "len": self.cli(["ARLEN", "bench:array:dense:num"], raw=True).strip(), + }, + "bench:array:dense:text": { + "count": self.cli(["ARCOUNT", "bench:array:dense:text"], raw=True).strip(), + "len": self.cli(["ARLEN", "bench:array:dense:text"], raw=True).strip(), + }, + "bench:array:sparse:text": { + "count": self.cli(["ARCOUNT", 
"bench:array:sparse:text"], raw=True).strip(), + "len": self.cli(["ARLEN", "bench:array:sparse:text"], raw=True).strip(), + }, + } + print("dataset:") + for key, info in summary.items(): + print(f" {key}: count={info['count']} len={info['len']}") + + def selected_workloads(self) -> list[Workload]: + workloads = self.workloads() + if not self.args.only: + return workloads + wanted = {name.strip() for name in self.args.only.split(",") if name.strip()} + unknown = wanted - {w.name for w in workloads} + if unknown: + raise BenchError(f"unknown workload(s): {', '.join(sorted(unknown))}") + return [w for w in workloads if w.name in wanted] + + def workloads(self) -> list[Workload]: + dense_range_end = min(8192 + 31, self.args.dense_len - 1) + return [ + Workload("arget_dense_rand", "ARGET dense random hit", ["ARGET", "bench:array:dense:num", "__rand_int__"], 200_000, 50, 16, rand_range=self.args.dense_len), + Workload("armget_dense_4_rand", "ARMGET dense 4 random hits", ["ARMGET", "bench:array:dense:num", "__rand_int__", "__rand_int__", "__rand_int__", "__rand_int__"], 100_000, 50, 16, rand_range=self.args.dense_len), + Workload("argetrange_dense_32", "ARGETRANGE dense 32 hot", ["ARGETRANGE", "bench:array:dense:num", "8192", str(dense_range_end)], 50_000, 32, 8), + Workload("arscan_dense_limit_100", "ARSCAN dense LIMIT 100", ["ARSCAN", "bench:array:dense:text", "0", str(self.args.dense_len - 1), "LIMIT", "100"], 50_000, 24, 4), + Workload("argrep_match_dense", "ARGREP MATCH dense", ["ARGREP", "bench:array:dense:text", "0", str(self.args.dense_len - 1), "MATCH", "encoding", "LIMIT", "20", "WITHVALUES"], 20_000, 20, 2), + Workload("argrep_re_dense_nocase", "ARGREP RE dense nocase", ["ARGREP", "bench:array:dense:text", "0", str(self.args.dense_len - 1), "RE", "encoding|complexity|helper", "NOCASE", "LIMIT", "20", "WITHVALUES"], 20_000, 20, 2), + Workload("arop_sum_dense_4096", "AROP SUM dense 4096", ["AROP", "bench:array:dense:num", "0", "4095", "SUM"], 50_000, 24, 4), + 
Workload("arget_sparse_rand", "ARGET sparse random mostly miss", ["ARGET", "bench:array:sparse:text", "__rand_int__"], 200_000, 50, 16, rand_range=self.args.sparse_space), + Workload("arscan_sparse_limit_100", "ARSCAN sparse LIMIT 100", ["ARSCAN", "bench:array:sparse:text", "0", str(self.args.sparse_space - 1), "LIMIT", "100"], 25_000, 20, 2), + Workload("argrep_match_sparse", "ARGREP MATCH sparse", ["ARGREP", "bench:array:sparse:text", "0", str(self.args.sparse_space - 1), "MATCH", "encoding", "LIMIT", "20", "WITHVALUES"], 10_000, 16, 1), + Workload("arop_used_sparse", "AROP USED sparse", ["AROP", "bench:array:sparse:text", "0", str(self.args.sparse_space - 1), "USED"], 25_000, 20, 2), + Workload("arset_dense_rand", "ARSET dense random update", ["ARSET", "bench:array:dense:num", "__rand_int__", "42"], 150_000, 50, 16, rand_range=self.args.dense_len), + Workload("armset_dense_4_rand", "ARMSET dense 4 random updates", ["ARMSET", "bench:array:dense:num", "__rand_int__", "11", "__rand_int__", "22", "__rand_int__", "33", "__rand_int__", "44"], 100_000, 50, 16, rand_range=self.args.dense_len), + Workload("arinsert_append_hot", "ARINSERT append hot path", ["ARINSERT", "bench:array:append", "x"], 50_000, 24, 8, setup="reset_append"), + Workload("arring_hot_1024", "ARRING size 1024 hot path", ["ARRING", "bench:array:ring", "1024", "x"], 100_000, 50, 16, setup="reset_ring"), + ] + + def run_workload(self, workload: Workload) -> Result: + if workload.setup: + getattr(self, workload.setup)() + if self.args.warmup and workload.warmup_requests > 0: + self.invoke_benchmark(workload, workload.warmup_requests, quiet=True) + raw = self.invoke_benchmark(workload, self.scale_requests(workload.requests), quiet=True) + qps = self.parse_qps(raw) + return Result( + name=workload.name, + description=workload.description, + qps=qps, + requests=self.scale_requests(workload.requests), + clients=workload.clients, + pipeline=workload.pipeline, + rand_range=workload.rand_range, + 
command=workload.command, + raw_output=raw.strip(), + ) + + def invoke_benchmark(self, workload: Workload, requests: int, quiet: bool) -> str: + cmd = [ + self.redis_benchmark, + "-h", self.host, + "-p", str(self.port), + "--dbnum", str(self.db), + "-n", str(requests), + "-c", str(workload.clients), + "-P", str(workload.pipeline), + "--seed", str(self.args.seed), + ] + if quiet: + cmd.append("-q") + if workload.rand_range: + cmd.extend(["-r", str(workload.rand_range)]) + cmd.extend(workload.command) + return subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT) + + def parse_qps(self, raw: str) -> float: + m = QPS_RE.search(raw) + if not m: + raise BenchError(f"could not parse qps from redis-benchmark output:\n{raw}") + return float(m.group(1)) + + def scale_requests(self, requests: int) -> int: + scaled = int(requests * self.args.request_scale) + return max(1000, scaled) + + def reset_append(self) -> None: + self.cli(["DEL", "bench:array:append"]) + + def reset_ring(self) -> None: + self.cli(["DEL", "bench:array:ring"]) + + def print_summary(self) -> None: + print("\nsummary:") + print("| workload | qps | req | c | P | notes |") + print("|---|---:|---:|---:|---:|---|") + for r in self.results: + notes = r.description + if r.rand_range: + notes += f", rand=0..{r.rand_range - 1}" + print(f"| {r.name} | {r.qps:.2f} | {r.requests} | {r.clients} | {r.pipeline} | {notes} |") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Standalone Array benchmark harness. It uses DB 9 by default, " + "flushes that DB, loads deterministic Array datasets, and runs " + "custom redis-benchmark workloads." 
+ ) + ) + parser.add_argument("--src-dir", help="Path to the src directory containing redis-server, redis-cli, and redis-benchmark") + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=6395) + parser.add_argument("--db", type=int, default=9) + parser.add_argument("--start-server", action="store_true", default=True, + help="Start an ephemeral redis-server on --port (default: enabled)") + parser.add_argument("--no-start-server", dest="start_server", action="store_false", + help="Use an already running server instead of starting one") + parser.add_argument("--keep-server", action="store_true", + help="Do not stop the ephemeral server after the run") + parser.add_argument("--only", help="Comma-separated workload names to run") + parser.add_argument("--seed", type=int, default=12345) + parser.add_argument("--request-scale", type=float, default=1.0, + help="Scale factor applied to all workload request counts") + parser.add_argument("--warmup", action="store_true", default=True, + help="Run a short warmup before each benchmark (default: enabled)") + parser.add_argument("--no-warmup", dest="warmup", action="store_false") + parser.add_argument("--json-out", help="Optional path for machine-readable results") + parser.add_argument("--dense-len", type=int, default=16_384, + help="Number of contiguous dense elements to preload") + parser.add_argument("--sparse-space", type=int, default=30_000_000, + help="Logical range used by sparse benchmarks") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + try: + bench = RedisArrayBench(args) + return bench.run() + except BenchError as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + except subprocess.CalledProcessError as exc: + output = exc.output if isinstance(exc.output, str) else exc.output.decode("utf-8", "replace") + print(output, file=sys.stderr) + return exc.returncode or 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/utils/generate-command-code.py b/utils/generate-command-code.py index 8a25039ad..b2137f1ae 100755 --- a/utils/generate-command-code.py +++ b/utils/generate-command-code.py @@ -34,6 +34,7 @@ GROUPS = { "geo": "COMMAND_GROUP_GEO", "stream": "COMMAND_GROUP_STREAM", "bitmap": "COMMAND_GROUP_BITMAP", + "array": "COMMAND_GROUP_ARRAY", "rate_limit": "COMMAND_GROUP_RATE_LIMIT", } @@ -603,6 +604,7 @@ const char *COMMAND_GROUP_STR[] = { "geo", "stream", "bitmap", + "array", "module", "rate_limit" }; From fab099cdcffb88fa5db7e78f9a70d7ab6da248d4 Mon Sep 17 00:00:00 2001 From: Shubham S Taple <155555100+ShubhamTaple@users.noreply.github.com> Date: Thu, 14 May 2026 11:47:47 +0530 Subject: [PATCH 03/19] Fix double free when loading streams with duplicate consumer PEL entries (#15095) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #15082 ## Problem Loading a stream from RDB/RESTORE with a malformed consumer PEL (the same pending ID listed twice for one consumer) hit an error path that called streamFreeNACK() on a nack that was still referenced from the group’s global PEL (cgroup->pel). Teardown then freed that nack again while destroying the stream, causing a double free and a possible server crash. ## Fix On the duplicate consumer PEL branch in src/rdb.c, stop calling streamFreeNACK(s, nack) when raxTryInsert(consumer->pel, …) fails. Keep reporting corruption and rely on decrRefCount(o) for cleanup, consistent with other paths where the nack is owned only by cgroup->pel. 
--- src/rdb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/rdb.c b/src/rdb.c index d6fb3f3d2..a450aff6e 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3584,7 +3584,6 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) rdbReportCorruptRDB("Duplicated consumer PEL entry " " loading a stream consumer " "group"); - streamFreeNACK(s, nack); decrRefCount(o); return NULL; } From 80621b1d0e371c075b59a8364d1b5e8f910447f3 Mon Sep 17 00:00:00 2001 From: sggeorgiev Date: Wed, 29 Oct 2025 16:02:57 +0200 Subject: [PATCH 04/19] Add DENYOOM flag to SUBSCRIBE, PSUBSCRIBE and SSUBSCRIBE commands Add the DENYOOM flag to SUBSCRIBE, PSUBSCRIBE, and SSUBSCRIBE commands to bring their memory protection behavior in line with other Redis commands. Problem: Currently, subscribe commands lack memory protection when Redis reaches its memory limit. This becomes problematic in two specific scenarios: 1. When the eviction policy doesn't allow eviction (e.g., noeviction) 2. When there are no evictable keys remaining in the database In these cases, memory usage from pub/sub subscribers can keep growing unchecked, potentially causing the Redis server to run out of memory. This behavior is inconsistent with other Redis commands, which are protected by the DENYOOM flag. Solution: Add the DENYOOM flag to all subscribe commands. When memory limits are reached, these commands will be rejected, preventing uncontrolled memory growth and aligning their behavior with other Redis commands. 
--- src/commands.def | 6 +- src/commands/psubscribe.json | 3 +- src/commands/ssubscribe.json | 3 +- src/commands/subscribe.json | 3 +- tests/unit/pubsub.tcl | 131 ++++++++++++++++++++++++++++++++++- 5 files changed, 139 insertions(+), 7 deletions(-) diff --git a/src/commands.def b/src/commands.def index 2726d1288..2aff10043 100644 --- a/src/commands.def +++ b/src/commands.def @@ -12547,13 +12547,13 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("rpush","Appends one or more elements to a list. Creates the key if it doesn't exist.","O(1) for each element added, so O(N) to add N elements when the command is called with multiple arguments.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,RPUSH_History,1,RPUSH_Tips,0,rpushCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_LIST,RPUSH_Keyspecs,1,NULL,2),.args=RPUSH_Args}, {MAKE_CMD("rpushx","Appends an element to a list only when the list exists.","O(1) for each element added, so O(N) to add N elements when the command is called with multiple arguments.","2.2.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,RPUSHX_History,1,RPUSHX_Tips,0,rpushxCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_LIST,RPUSHX_Keyspecs,1,NULL,2),.args=RPUSHX_Args}, /* pubsub */ -{MAKE_CMD("psubscribe","Listens for messages published to channels that match one or more patterns.","O(N) where N is the number of patterns to subscribe to.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,PSUBSCRIBE_History,0,PSUBSCRIBE_Tips,0,psubscribeCommand,-2,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,PSUBSCRIBE_Keyspecs,0,NULL,1),.args=PSUBSCRIBE_Args}, +{MAKE_CMD("psubscribe","Listens for messages published to channels that match one or more patterns.","O(N) where N is the number of patterns to subscribe 
to.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,PSUBSCRIBE_History,0,PSUBSCRIBE_Tips,0,psubscribeCommand,-2,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL|CMD_DENYOOM,0,PSUBSCRIBE_Keyspecs,0,NULL,1),.args=PSUBSCRIBE_Args}, {MAKE_CMD("publish","Posts a message to a channel.","O(N+M) where N is the number of clients subscribed to the receiving channel and M is the total number of subscribed patterns (by any client).","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,PUBLISH_History,0,PUBLISH_Tips,0,publishCommand,3,CMD_PUBSUB|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_MAY_REPLICATE|CMD_SENTINEL,0,PUBLISH_Keyspecs,0,NULL,2),.args=PUBLISH_Args}, {MAKE_CMD("pubsub","A container for Pub/Sub commands.","Depends on subcommand.","2.8.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,PUBSUB_History,0,PUBSUB_Tips,0,NULL,-2,0,0,PUBSUB_Keyspecs,0,NULL,0),.subcommands=PUBSUB_Subcommands}, {MAKE_CMD("punsubscribe","Stops listening to messages published to channels that match one or more patterns.","O(N) where N is the number of patterns to unsubscribe.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,PUNSUBSCRIBE_History,0,PUNSUBSCRIBE_Tips,0,punsubscribeCommand,-1,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,PUNSUBSCRIBE_Keyspecs,0,NULL,1),.args=PUNSUBSCRIBE_Args}, {MAKE_CMD("spublish","Post a message to a shard channel","O(N) where N is the number of clients subscribed to the receiving shard channel.","7.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SPUBLISH_History,0,SPUBLISH_Tips,0,spublishCommand,3,CMD_PUBSUB|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_MAY_REPLICATE,0,SPUBLISH_Keyspecs,1,NULL,2),.args=SPUBLISH_Args}, -{MAKE_CMD("ssubscribe","Listens for messages published to shard channels.","O(N) where N is the number of shard channels to subscribe 
to.","7.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SSUBSCRIBE_History,0,SSUBSCRIBE_Tips,0,ssubscribeCommand,-2,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,SSUBSCRIBE_Keyspecs,1,NULL,1),.args=SSUBSCRIBE_Args}, -{MAKE_CMD("subscribe","Listens for messages published to channels.","O(N) where N is the number of channels to subscribe to.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SUBSCRIBE_History,0,SUBSCRIBE_Tips,0,subscribeCommand,-2,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,SUBSCRIBE_Keyspecs,0,NULL,1),.args=SUBSCRIBE_Args}, +{MAKE_CMD("ssubscribe","Listens for messages published to shard channels.","O(N) where N is the number of shard channels to subscribe to.","7.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SSUBSCRIBE_History,0,SSUBSCRIBE_Tips,0,ssubscribeCommand,-2,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_DENYOOM,0,SSUBSCRIBE_Keyspecs,1,NULL,1),.args=SSUBSCRIBE_Args}, +{MAKE_CMD("subscribe","Listens for messages published to channels.","O(N) where N is the number of channels to subscribe to.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SUBSCRIBE_History,0,SUBSCRIBE_Tips,0,subscribeCommand,-2,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL|CMD_DENYOOM,0,SUBSCRIBE_Keyspecs,0,NULL,1),.args=SUBSCRIBE_Args}, {MAKE_CMD("sunsubscribe","Stops listening to messages posted to shard channels.","O(N) where N is the number of shard channels to unsubscribe.","7.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SUNSUBSCRIBE_History,0,SUNSUBSCRIBE_Tips,0,sunsubscribeCommand,-1,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,SUNSUBSCRIBE_Keyspecs,1,NULL,1),.args=SUNSUBSCRIBE_Args}, {MAKE_CMD("unsubscribe","Stops listening to messages posted to channels.","O(N) where N is the number of channels to 
unsubscribe.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,UNSUBSCRIBE_History,0,UNSUBSCRIBE_Tips,0,unsubscribeCommand,-1,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,UNSUBSCRIBE_Keyspecs,0,NULL,1),.args=UNSUBSCRIBE_Args}, /* rate_limit */ diff --git a/src/commands/psubscribe.json b/src/commands/psubscribe.json index cab5d14ef..8c56db2cc 100644 --- a/src/commands/psubscribe.json +++ b/src/commands/psubscribe.json @@ -11,7 +11,8 @@ "NOSCRIPT", "LOADING", "STALE", - "SENTINEL" + "SENTINEL", + "DENYOOM" ], "arguments": [ { diff --git a/src/commands/ssubscribe.json b/src/commands/ssubscribe.json index 46373d541..5bebc6c8c 100644 --- a/src/commands/ssubscribe.json +++ b/src/commands/ssubscribe.json @@ -10,7 +10,8 @@ "PUBSUB", "NOSCRIPT", "LOADING", - "STALE" + "STALE", + "DENYOOM" ], "arguments": [ { diff --git a/src/commands/subscribe.json b/src/commands/subscribe.json index bdf12b726..63e838d7d 100644 --- a/src/commands/subscribe.json +++ b/src/commands/subscribe.json @@ -12,7 +12,8 @@ "NOSCRIPT", "LOADING", "STALE", - "SENTINEL" + "SENTINEL", + "DENYOOM" ], "arguments": [ { diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl index 115970a31..afcddee77 100644 --- a/tests/unit/pubsub.tcl +++ b/tests/unit/pubsub.tcl @@ -1369,5 +1369,134 @@ start_server {tags {"pubsub network"}} { assert_equal [r publish foo vaz] {1} assert_equal [r read] {message foo vaz} } {} {resp3} - +} + +start_server {tags {"pubsub network"}} { + # Helper proc for tests that subscribe multiple times until hitting OOM + proc test_subscribe_oom_loop {cmd description clients} { + test "$cmd $description fails with OOM when memory limit exceeded" { + # Set 10MB memory limit + r config set maxmemory 10485760 + r config set maxmemory-policy noeviction + + # Create clients + if {$clients == 1} { + set rd [redis_deferring_client] + } else { + set rd1 [redis_deferring_client] + set rd2 [redis_deferring_client] + } + + set base_str [string repeat "a" 2048] + set 
success_count 0 + set oom_occurred 0 + + # Try to subscribe until we hit OOM + for {set i 0} {$i < 5000} {incr i} { + # Select client + if {$clients == 1} { + set client $rd + } else { + set client [expr {$i % 2 ? $rd1 : $rd2}] + } + + # Build channel/pattern name + if {$cmd eq "psubscribe"} { + set channel_name "${base_str}${i}*" + } else { + set channel_name "${base_str}${i}" + } + + $client $cmd $channel_name + if {[catch {$client read} err]} { + if {[string match "*OOM command not allowed*" $err]} { + set oom_occurred 1 + break + } + error "Unexpected error: $err" + } + incr success_count + } + + # Verify we had at least one success and hit OOM + assert {$success_count > 10} + assert {$oom_occurred == 1} + + # Close clients + if {$clients == 1} { + $rd close + } else { + $rd1 close + $rd2 close + } + } + } + + # Helper proc for tests with single large channel that immediately fails + proc test_subscribe_large_channel_oom {cmd channel_type} { + test "$cmd with large $channel_type name fails due to OOM" { + # Set maxmemory to 2MB + r config set maxmemory 2097152 + r config set maxmemory-policy noeviction + + # Create large channel/pattern name: 2MB + set channel_name [string repeat "a" 2097152] + + # Create a single pubsub client + set rd [redis_deferring_client] + + # Subscribe should fail with OOM error + $rd $cmd $channel_name + assert_error "*OOM command not allowed when used memory > 'maxmemory'*" {$rd read} + + # Cleanup + $rd close + } + } + + # Helper proc for tests with small success then large failure + proc test_subscribe_small_then_large_oom {cmd channel_type} { + test "$cmd succeeds with small $channel_type but fails with large $channel_type due to OOM" { + # Set maxmemory to 5MB + r config set maxmemory 5242880 + r config set maxmemory-policy noeviction + + # Create channel names: first 10KB, second 5MB + set channel1 [string repeat "a" 10240] + set channel2 [string repeat "b" 5242880] + + # Create a single pubsub client + set rd 
[redis_deferring_client] + + # First subscribe should succeed (10KB) + $rd $cmd $channel1 + set reply1 [$rd read] + assert_equal [list $cmd] [lindex $reply1 0] + + # Second subscribe should fail with OOM error (5MB exceeds limit) + $rd $cmd $channel2 + assert_error "*OOM command not allowed when used memory > 'maxmemory'*" {$rd read} + + # Cleanup + $rd close + } + } + + # Multiple subscriptions until OOM tests + test_subscribe_oom_loop "subscribe" "" 1 + test_subscribe_oom_loop "ssubscribe" "" 1 + test_subscribe_oom_loop "psubscribe" "" 1 + test_subscribe_oom_loop "subscribe" "with 2 clients" 2 + test_subscribe_oom_loop "ssubscribe" "with 2 clients" 2 + test_subscribe_oom_loop "psubscribe" "with 2 clients" 2 + + # Single large channel immediate OOM tests + test_subscribe_large_channel_oom "subscribe" "channel" + test_subscribe_large_channel_oom "psubscribe" "pattern" + test_subscribe_large_channel_oom "ssubscribe" "shard channel" + + # Small success then large failure tests + test_subscribe_small_then_large_oom "subscribe" "channel" + test_subscribe_small_then_large_oom "psubscribe" "pattern" + test_subscribe_small_then_large_oom "ssubscribe" "channel" } From c4e340570495ff24b04c3507e29320711f1b82c4 Mon Sep 17 00:00:00 2001 From: Sergei Georgiev Date: Thu, 27 Nov 2025 11:41:11 +0200 Subject: [PATCH 05/19] Invalid Memory Access in Redis RESTORE Command (CVE-2026-25243) --- modules/vector-sets/vset.c | 46 +++++++++++++++-- src/rdb.c | 6 +-- src/sds.c | 9 +++- src/zipmap.c | 7 +++ tests/integration/corrupt-dump.tcl | 83 ++++++++++++++++++++++++++++++ tests/unit/dump.tcl | 13 +++++ 6 files changed, 155 insertions(+), 9 deletions(-) diff --git a/modules/vector-sets/vset.c b/modules/vector-sets/vset.c index 618723e91..46e504b78 100644 --- a/modules/vector-sets/vset.c +++ b/modules/vector-sets/vset.c @@ -1987,14 +1987,50 @@ void *VectorSetRdbLoad(RedisModuleIO *rdb, int encver) { uint32_t input_dim = RedisModule_LoadUnsigned(rdb); if (RedisModule_IsIOError(rdb)) goto 
ioerr; uint32_t output_dim = dim; - size_t matrix_size = sizeof(float) * input_dim * output_dim; + + /* Sanity check dimensions to avoid absurd / degenerate matrices. */ + if (input_dim == 0 || output_dim == 0) { + RedisModule_LogIOError(rdb, "warning", + "Invalid projection matrix dimensions: input_dim=%u, output_dim=%u", + (unsigned)input_dim, (unsigned)output_dim); + goto ioerr; + } + + /* Check for overflow in matrix_size = sizeof(float) * input_dim * output_dim. */ + #if SIZE_MAX == UINT32_MAX + if ((size_t)output_dim > SIZE_MAX / sizeof(float)) { + RedisModule_LogIOError(rdb, "warning", + "Projection matrix size overflow (output_dim too large): input_dim=%u, output_dim=%u", + (unsigned)input_dim, (unsigned)output_dim); + goto ioerr; + } + #endif + + size_t max_input = SIZE_MAX / (sizeof(float) * (size_t)output_dim); + if ((size_t)input_dim > max_input) { + RedisModule_LogIOError(rdb, "warning", + "Projection matrix size overflow: input_dim=%u, output_dim=%u", + (unsigned)input_dim, (unsigned)output_dim); + goto ioerr; + } + + size_t matrix_size = sizeof(float) * (size_t)input_dim * (size_t)output_dim; + + /* Load projection matrix as a binary blob and validate length. 
*/ + size_t blob_len = 0; + char *matrix_blob = RedisModule_LoadStringBuffer(rdb, &blob_len); + if (matrix_blob == NULL) goto ioerr; + + if (blob_len != matrix_size) { + RedisModule_LogIOError(rdb, "warning", + "Mismatching projection matrix length: expected=%zu, got=%zu", + matrix_size, blob_len); + RedisModule_Free(matrix_blob); + goto ioerr; + } vset->proj_matrix = RedisModule_Alloc(matrix_size); vset->proj_input_size = input_dim; - - // Load projection matrix as a binary blob - char *matrix_blob = RedisModule_LoadStringBuffer(rdb, NULL); - if (matrix_blob == NULL) goto ioerr; memcpy(vset->proj_matrix, matrix_blob, matrix_size); RedisModule_Free(matrix_blob); } diff --git a/src/rdb.c b/src/rdb.c index a450aff6e..31e813199 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3058,13 +3058,13 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) /* search for duplicate records */ sds field = sdstrynewlen(fstr, flen); - int field_added = (field != NULL && dictAdd(dupSearchDict, field, NULL) == DICT_OK); - if (!field_added || !lpSafeToAdd(lp, (size_t)flen + vlen)) { + if (!field || !lpSafeToAdd(lp, (size_t)flen + vlen) || + dictAdd(dupSearchDict, field, NULL) != DICT_OK) { rdbReportCorruptRDB("Hash zipmap with dup elements, or big length (%u)", flen); /* If field was not added to dict, we still own it. * If it was added, dict owns it and dictRelease will free it. 
*/ - if (!field_added) sdsfree(field); dictRelease(dupSearchDict); + sdsfree(field); lpFree(lp); zfree(encoded); o->ptr = NULL; diff --git a/src/sds.c b/src/sds.c index 14babcc51..2dacb0fdf 100644 --- a/src/sds.c +++ b/src/sds.c @@ -105,7 +105,14 @@ sds _sdsnewlen(const void *init, size_t initlen, int trymalloc) { int hdrlen = sdsHdrSize(type); size_t bufsize; - assert(initlen + hdrlen + 1 > initlen); /* Catch size_t overflow */ + if (trymalloc) { + /* protect against size_t overflow */ + if (initlen + hdrlen + 1 <= initlen) + return NULL; + } else { + assert(initlen + hdrlen + 1 > initlen); /* Catch size_t overflow */ + } + sh = trymalloc? s_trymalloc_usable(hdrlen+initlen+1, &bufsize) : s_malloc_usable(hdrlen+initlen+1, &bufsize); diff --git a/src/zipmap.c b/src/zipmap.c index 51c64ca81..e3981d810 100644 --- a/src/zipmap.c +++ b/src/zipmap.c @@ -387,6 +387,10 @@ int zipmapValidateIntegrity(unsigned char *zm, size_t size, int deep) { /* read the field name length */ l = zipmapDecodeLength(p); + /* Sanity check: length < 254 must be encoded in 1 byte, not 5 bytes */ + if (l < ZIPMAP_BIGLEN && s != 1) + return 0; + p += s; /* skip the encoded field size */ p += l; /* skip the field */ @@ -402,6 +406,9 @@ int zipmapValidateIntegrity(unsigned char *zm, size_t size, int deep) { /* read the value length */ l = zipmapDecodeLength(p); + /* Sanity check: length < 254 must be encoded in 1 byte, not 5 bytes */ + if (l < ZIPMAP_BIGLEN && s != 1) + return 0; p += s; /* skip the encoded value size*/ e = *p++; /* skip the encoded free space (always encoded in one byte) */ p += l+e; /* skip the value and free space */ diff --git a/tests/integration/corrupt-dump.tcl b/tests/integration/corrupt-dump.tcl index 7a275cf83..d333a4764 100644 --- a/tests/integration/corrupt-dump.tcl +++ b/tests/integration/corrupt-dump.tcl @@ -1070,5 +1070,88 @@ test {corrupt payload: stream live entry count integer overflow bypasses length } } +test {corrupt payload: zipmap - element wouldn't fit in 
listpack} { + # Redis converts legacy zipmap encoded hashes to listpacks. + # This test creates a zipmap entry with a 1GB value which cannot + # fit into a listpack and verifies that RESTORE fails. + + start_server [list overrides [list loglevel verbose use-exit-on-panic yes crash-memcheck-enabled no proto-max-bulk-len 2147483648 client-query-buffer-limit 2147483648]] { + proc zipmap_encode_len {len} { + if {$len < 254} { + return [binary format c $len] + } else { + return [binary format ci 254 $len] + } + } + r config set sanitize-dump-payload no + + # Generates Zipmap with 1GB value - should fail lpSafeToAdd check + set val_len [expr {1024 * 1024 * 1024 + 1}] + + # Zipmap has 1 element + set zm [binary format c 1] + # Field is 1 byte long + append zm [zipmap_encode_len 1] + append zm "k" + # Value is 1GB long + append zm [zipmap_encode_len $val_len] + append zm [binary format c 0] + append zm [string repeat "A" $val_len] + # ZIPMAP_END marker + append zm [binary format c 255] + # Prepend RDB header + set zm_len [string length $zm] + set rdb_len [binary format cI 0x80 $zm_len] + set dump [binary format c 9] + append dump $rdb_len + append dump $zm + append dump [binary format s 9] + append dump [binary format w 0] + + catch {r RESTORE _hash 0 $dump} err + assert_match "*Bad data format*" $err + } +} {} {large-memory} + +test {corrupt payload: zipmap - 5 bytes length encoding for a small field} { + start_server [list overrides [list loglevel verbose use-exit-on-panic yes crash-memcheck-enabled no]] { + catch { + r restore key 0 "\x09\x11\x01\xfe\x04\x00\x00\x00\x01\x00\xff\x00\x04\x00\x76\x61\x6c\x31\xff\x09\x00\xf9\xd5\xa4\xf7\x7d\x00\x3f\x1b" + } err + assert_match "*Bad data format*" $err + verify_log_message 0 "*integrity check failed*" 0 + } +} + +test {corrupt payload: zipmap - 5 bytes length encoding for a small value} { + start_server [list overrides [list loglevel verbose use-exit-on-panic yes crash-memcheck-enabled no]] { + catch { + r restore key 0 
"\x09\x0e\x01\x01\x6b\xfe\x04\x00\x00\x00\x00\x76\x61\x6c\x31\xff\x09\x00\xd0\xf9\xe4\x1d\xe4\xfb\x11\x4c" + } err + assert_match "*Bad data format*" $err + verify_log_message 0 "*integrity check failed*" 0 + } +} + +test {corrupt payload: zipmap - 5 bytes length encoding and a huge field} { + start_server [list overrides [list loglevel verbose use-exit-on-panic yes crash-memcheck-enabled no] ] { + catch { + r restore key 0 "\x09\x41\x15\x02\x04\x6b\x65\x79\x31\x04\x00\x76\x61\x6c\x31\xfe\x04\x00\x00\x00\xfe\xff\xff\xff\xfd\x00\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\x42\xff\x09\x00\x54\x2f\x0a\xca\x4e\x5c\x49\x9f" + } err + assert_match "*Bad data format*" $err + verify_log_message 0 "*integrity check failed*" 0 + } +} + +test {corrupt payload: stream - duplicated consumer PEL entry} { + start_server [list overrides [list loglevel verbose use-exit-on-panic yes crash-memcheck-enabled no] ] { + catch { + r restore key 0 
"\x15\x01\x10\x00\x00\x01\x9b\x0d\x56\xa9\xb7\x00\x00\x00\x00\x00\x00\x00\x00\xc3\x39\x40\x42\x15\x42\x00\x00\x00\x11\x00\x02\x01\x00\x01\x01\x01\x86\x66\x69\x65\x6c\x64\x31\x07\x00\x01\x40\x0f\x0a\x00\x01\x86\x76\x61\x6c\x75\x65\x31\x07\x04\x20\x0b\x02\xcd\xd9\x02\xe0\x01\x22\x01\x32\x07\x80\x1a\x04\x32\x07\x06\x01\xff\x02\x81\x00\x00\x01\x9b\x0d\x56\xb7\x90\x00\x81\x00\x00\x01\x9b\x0d\x56\xa9\xb7\x00\x00\x00\x02\x01\x07\x6d\x79\x67\x72\x6f\x75\x70\x81\x00\x00\x01\x9b\x0d\x56\xb7\x90\x00\x02\x02\x00\x00\x01\x9b\x0d\x56\xa9\xb7\x00\x00\x00\x00\x00\x00\x00\x00\x80\xd9\x56\x0d\x9b\x01\x00\x00\x01\x00\x00\x01\x9b\x0d\x56\xb7\x90\x00\x00\x00\x00\x00\x00\x00\x00\x80\xd9\x56\x0d\x9b\x01\x00\x00\x01\x01\x09\x63\x6f\x6e\x73\x75\x6d\x65\x72\x31\x80\xd9\x56\x0d\x9b\x01\x00\x00\x80\xd9\x56\x0d\x9b\x01\x00\x00\x02\x00\x00\x01\x9b\x0d\x56\xa9\xb7\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x9b\x0d\x56\xa9\xb7\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x09\x00\x4b\xe0\x99\x30\x67\x4d\xe5\x87" + } err + assert_match "*Bad data format*" $err + verify_log_message 0 "*Stream consumer PEL entry already has a consumer assigned*" 0 + } +} + } ;# tags diff --git a/tests/unit/dump.tcl b/tests/unit/dump.tcl index 923e391b4..1a5c01ea4 100644 --- a/tests/unit/dump.tcl +++ b/tests/unit/dump.tcl @@ -158,6 +158,19 @@ start_server {tags {"dump"}} { close_replication_stream $repl } {} {needs:repl} + test {RESTORE fail with invalid payload size} { + # Payload with mismatched size: claims 0xFFFFFFFFFFFFFFF7 bytes (max uint64 - 8) but provides no data + # \x00 = String type + # \x81 = 64-bit length marker + # \xFF\xFF\xFF\xFF\xFF\xFF\xFF\xF7 = 18446744073709551607 in big-endian + # \x0c\x00 = RDB version + # \x00... 
= fake CRC64 + set encoded "\x00\x81\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xF7\x0c\x00\x00\x00\x00\x00\x00\x00\x00\x00" + r del test + catch {r restore test 0 $encoded} e + set e + } {*Bad data format*} + test {DUMP of non existing key returns nil} { r dump nonexisting_key } {} From 8cdf9391da50ba79784cce1d141125e5a4f31b35 Mon Sep 17 00:00:00 2001 From: Stav-Levi <45394834+StavRLevi@users.noreply.github.com> Date: Mon, 29 Dec 2025 08:49:58 +0200 Subject: [PATCH 06/19] Fix crash in Lua debugger when error object is not a string --- src/eval.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/eval.c b/src/eval.c index 30acedd9e..0edea5ddd 100644 --- a/src/eval.c +++ b/src/eval.c @@ -1502,7 +1502,9 @@ void ldbEval(lua_State *lua, sds *argv, int argc) { sdsfree(code); sdsfree(expr); if (lua_pcall(lua,0,1,0)) { - ldbLog(sdscatfmt(sdsempty()," %s",lua_tostring(lua,-1))); + const char *err = lua_tostring(lua,-1); + ldbLog(sdscatfmt(sdsempty()," %s", + err ? err : "(error object is not a string)")); lua_pop(lua,1); return; } From 837ca7f8f4d835fdc7754cef3c944a429dc28c29 Mon Sep 17 00:00:00 2001 From: Ozan Tezcan Date: Tue, 14 Apr 2026 08:47:04 +0300 Subject: [PATCH 07/19] Fix use-after-free when fullsync happens while replica is running a timed out script (CVE-2026-23631) Fullsync triggers emptyData and scriptingReset which free the scripting/function engine. If a timed out script is still running on the replica, this causes a use-after-free. Delay fullsync processing in readSyncBulkPayload until the script finishes. 
--- src/replication.c | 5 ++ tests/integration/replication.tcl | 77 +++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/src/replication.c b/src/replication.c index 6726cff19..44d81ba51 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2251,6 +2251,11 @@ void replicationAttachToNewMaster(void) { /* Asynchronously read the SYNC payload we receive from a master */ #define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024*1024*8) /* 8 MB */ void readSyncBulkPayload(connection *conn) { + /* During full sync, the functions engine is freed right before loading + * the RDB. To avoid this happening while a function is still running, + * delay full sync processing until it finishes. */ + if (isInsideYieldingLongCommand()) return; + char buf[PROTO_IOBUF_LEN]; ssize_t nread, readlen, nwritten; int use_diskless_load = useDisklessLoad(); diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index b3a03a2f9..b3020ab5f 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -1878,3 +1878,80 @@ start_server {tags {"repl external:skip"}} { } } } + +# Fullsync should not free the functions lib ctx while the replica has +# a timed out function that is still running. 
+foreach type {script function} { + start_server {tags {"repl external:skip"}} { + start_server {} { + set master [srv -1 client] + set master_host [srv -1 host] + set master_port [srv -1 port] + set replica [srv 0 client] + + test "Fullsync should not free scripting engine on a replica while a $type is running" { + $master config set repl-diskless-sync yes + $master config set repl-diskless-sync-delay 0 + # Set small client output buffer limit to trigger fullsync quickly + $master config set client-output-buffer-limit "replica 1k 1k 0" + $replica config set repl-diskless-load yes + $replica config set busy-reply-threshold 1 ;# script timeout in 1 ms + + # Load function + if {$type eq "function"} { + $master function load replace {#!lua name=blocklib + redis.register_function{ + function_name='blockfunc', + callback=function() while true do end end, + flags={'no-writes'} + } + } + } + + # Start replication + $replica replicaof $master_host $master_port + wait_for_sync $replica + + # Run the blocking script on replica + set rd [redis_deferring_client] + if {$type eq "script"} { + $rd eval {while true do end} 0 + } else { + $rd fcall_ro blockfunc 0 + } + + # Verify replica replies with BUSY + wait_for_condition 50 100 { + [catch {$replica ping} e] == 1 && [string match {*BUSY*} $e] + } else { + fail "$type didn't become busy" + } + + # Fills client output buffer and triggers fullsync + populate 5 bigkey 1000000 -1 + wait_for_condition 50 100 { + [s -1 sync_full] >= 2 + } else { + fail "Fullsync was not triggered" + } + + # Verify replica is still running the function + after 1000 + catch {$replica ping} e + assert_match {*BUSY*} $e "replica should still reply with BUSY" + + if {$type eq "script"} { + $replica script kill + } else { + $replica function kill + } + + # Verify replica is responsive again + catch {$rd read} result + $rd close + wait_for_sync $replica + assert_equal [$replica ping] "PONG" + } + } + } +} From 5c355b68ecc87a9b50ebf644895e8e45bbf6072c Mon Sep 
17 00:00:00 2001 From: "debing.sun" Date: Mon, 29 Dec 2025 16:20:41 +0800 Subject: [PATCH 08/19] Fix use-after-free when evicting blocked client during unblock (CVE-2026-23479) When re-executing a pending command after unblocking, check the return value of `processCommandAndResetClient` and exit if needed. --- src/blocked.c | 8 +++++++- tests/unit/client-eviction.tcl | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/blocked.c b/src/blocked.c index b973adeaf..74558b485 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -699,7 +699,13 @@ static void unblockClientOnKey(client *c, robj *key) { client *old_client = server.current_client; server.current_client = c; enterExecutionUnit(1, 0); - processCommandAndResetClient(c); + if (processCommandAndResetClient(c) == C_ERR) { + /* Client was freed during command processing, exit immediately */ + exitExecutionUnit(); + server.current_client = old_client; + return; + } + if (!(c->flags & CLIENT_BLOCKED)) { if (c->flags & CLIENT_MODULE) { moduleCallCommandUnblockedHandler(c); diff --git a/tests/unit/client-eviction.tcl b/tests/unit/client-eviction.tcl index 2e08715b8..afe32e4f9 100644 --- a/tests/unit/client-eviction.tcl +++ b/tests/unit/client-eviction.tcl @@ -611,5 +611,34 @@ start_server {} { } } +start_server {} { + r flushall + r client no-evict on + r config set maxmemory-clients 0 + + test "Verify blocked client eviction during unblock does not cause use-after-free" { + # Create a deferring client that will be blocked on stream + # Use a long stream name to make client memory usage exceed 200000 bytes + set rd [redis_deferring_client] + $rd XREAD BLOCK 0 STREAMS mystream stream_[string repeat x 200000] $ $ + + # Wait for the client to be blocked + wait_for_condition 50 100 { + [s blocked_clients] eq {1} + } else { + fail "Client was not blocked" + } + + # Now lower MAXMEMORY-CLIENTS to a low value and use + # XADD to unblock the blocked client, triggering eviction. 
+ r MULTI + r CONFIG SET MAXMEMORY-CLIENTS 100000 ;# Put in MULTI to defer blocked client eviction until after EXEC + r XADD mystream * field val + r EXEC + r PING + $rd close + } +} + } ;# tags From 22f1ab6e2756cc44f7f978ee1cd444187839444f Mon Sep 17 00:00:00 2001 From: Stav-Levi <45394834+StavRLevi@users.noreply.github.com> Date: Wed, 5 Nov 2025 08:55:28 +0200 Subject: [PATCH 09/19] Fix cluster AUX-field newline/control-character injection bricks a node on restart * fix cluster AUX-field newline/control-character injection bricks a node on restart --- src/cluster.c | 7 +- src/config.c | 20 +++++- tests/unit/cluster/announced-endpoints.tcl | 84 ++++++++++++++++++++++ 3 files changed, 109 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 98bb0ebda..637b5dd9a 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -804,7 +804,12 @@ int verifyClusterNodeId(const char *name, int length) { } int isValidAuxChar(int c) { - return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); + /* Reject control characters (0x00-0x1F and 0x7F). */ + if (iscntrl(c)) { + return 0; + } + /* Reject forbidden characters including nodes.conf delimiters and special parsing characters */ + return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~,= \"'\\", c) == NULL); } int isValidAuxString(char *s, unsigned int length) { diff --git a/src/config.c b/src/config.c index 394d0fa01..994f8ae34 100644 --- a/src/config.c +++ b/src/config.c @@ -24,6 +24,7 @@ #include #include #include +#include /*----------------------------------------------------------------------------- * Config file name-value maps. @@ -2452,6 +2453,23 @@ static int isValidAnnouncedHostname(char *val, const char **err) { return 1; } +/* Validation function for cluster-announce-ip. + * Ensures the IP address is valid and rejects control characters. 
*/ +static int isValidClusterAnnounceIp(char *val, const char **err) { + unsigned char buf[sizeof(struct in6_addr)]; + /* Empty string is allowed - it will be converted to NULL by EMPTY_STRING_IS_NULL flag */ + if (val[0] == '\0') { + return 1; + } + + if (inet_pton(AF_INET, val, buf) != 1 && + inet_pton(AF_INET6, val, buf) != 1) { + *err = "Cluster announce IP must be a valid IPv4 or IPv6 address"; + return 0; + } + return 1; +} + /* Validate specified string is a valid proc-title-template */ static int isValidProcTitleTemplate(char *val, const char **err) { if (!validateProcTitleTemplate(val)) { @@ -3186,7 +3204,7 @@ standardConfig static_configs[] = { createStringConfig("pidfile", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.pidfile, NULL, NULL, NULL), createStringConfig("replica-announce-ip", "slave-announce-ip", MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.slave_announce_ip, NULL, NULL, NULL), createStringConfig("masteruser", NULL, MODIFIABLE_CONFIG | SENSITIVE_CONFIG, EMPTY_STRING_IS_NULL, server.masteruser, NULL, NULL, NULL), - createStringConfig("cluster-announce-ip", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.cluster_announce_ip, NULL, NULL, updateClusterIp), + createStringConfig("cluster-announce-ip", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.cluster_announce_ip, NULL, isValidClusterAnnounceIp, updateClusterIp), createStringConfig("cluster-config-file", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.cluster_configfile, "nodes.conf", NULL, NULL), createStringConfig("cluster-announce-hostname", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.cluster_announce_hostname, NULL, isValidAnnouncedHostname, updateClusterHostname), createStringConfig("cluster-announce-human-nodename", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.cluster_announce_human_nodename, NULL, isValidAnnouncedNodename, updateClusterHumanNodename), diff --git a/tests/unit/cluster/announced-endpoints.tcl 
b/tests/unit/cluster/announced-endpoints.tcl index a37ca58d1..5784ea6f5 100644 --- a/tests/unit/cluster/announced-endpoints.tcl +++ b/tests/unit/cluster/announced-endpoints.tcl @@ -72,4 +72,88 @@ start_cluster 2 2 {tags {external:skip cluster}} { fail "Cluster announced port was not updated in cluster slots" } } + + # Tests for cluster-announce-ip validation + test "cluster-announce-ip validation" { + catch {R 0 config set cluster-announce-ip "192.168.1.100\nnext"} err + assert_match "*valid IPv4 or IPv6*" $err + + catch {R 0 config set cluster-announce-ip "10.0.0.1\ttab"} err + assert_match "*valid IPv4 or IPv6*" $err + + catch {R 0 config set cluster-announce-ip "1.2.3.4\r\n"} err + assert_match "*valid IPv4 or IPv6*" $err + + catch {R 0 config set cluster-announce-ip "redis-node-1.example.com"} err + assert_match "*valid IPv4 or IPv6*" $err + + catch {R 0 config set cluster-announce-ip "192.168.1"} err + assert_match "*valid IPv4 or IPv6*" $err + + # Accept valid IPv4 + R 0 config set cluster-announce-ip "192.168.1.100" + assert_equal "192.168.1.100" [lindex [R 0 config get cluster-announce-ip] 1] + + # Accept valid IPv6 + R 0 config set cluster-announce-ip "2001:db8::1" + assert_equal "2001:db8::1" [lindex [R 0 config get cluster-announce-ip] 1] + + # Can be cleared + R 0 config set cluster-announce-ip "" + assert_equal "" [lindex [R 0 config get cluster-announce-ip] 1] + } + + # Tests for cluster-announce-human-nodename validation + test "cluster-announce-human-nodename validation" { + # Reject control characters + catch {R 0 config set cluster-announce-human-nodename "badchar\nnext"} err + assert_match "*invalid character*" $err + + catch {R 0 config set cluster-announce-human-nodename "bad\ttab"} err + assert_match "*invalid character*" $err + + catch {R 0 config set cluster-announce-human-nodename "bad\r\nline"} err + assert_match "*invalid character*" $err + + # Reject delimiter characters (comma, equals, space) + catch {R 0 config set 
cluster-announce-human-nodename "bad,comma"} err + assert_match "*invalid character*" $err + + catch {R 0 config set cluster-announce-human-nodename "bad=equals"} err + assert_match "*invalid character*" $err + + catch {R 0 config set cluster-announce-human-nodename "bad space"} err + assert_match "*invalid character*" $err + + # Reject quote characters (double quote, single quote, backslash) + catch {R 0 config set cluster-announce-human-nodename "bad\"quote"} err + assert_match "*invalid character*" $err + + catch {R 0 config set cluster-announce-human-nodename "bad'quote"} err + assert_match "*invalid character*" $err + + catch {R 0 config set cluster-announce-human-nodename "bad\\slash"} err + assert_match "*invalid character*" $err + + # Accept valid names + R 0 config set cluster-announce-human-nodename "my-redis-node-1" + assert_equal "my-redis-node-1" [lindex [R 0 config get cluster-announce-human-nodename] 1] + } + + # DoS prevention test: verify server can restart after CLUSTER SAVECONFIG + test "cluster-announce-ip persists correctly with CLUSTER SAVECONFIG" { + R 0 config set cluster-announce-ip "192.168.1.100" + R 0 cluster saveconfig + + # Verify the IP appears in CLUSTER NODES output + assert_match "*192.168.1.100*" [R 0 cluster nodes] + } + + test "cluster-announce-human-nodename persists correctly with CLUSTER SAVECONFIG" { + R 0 config set cluster-announce-human-nodename "production-node-1" + R 0 cluster saveconfig + + # Verify the nodename is set correctly + assert_equal "production-node-1" [lindex [R 0 config get cluster-announce-human-nodename] 1] + } } From bc904dab04b16b4bf3d6c81b591d197f58f93afa Mon Sep 17 00:00:00 2001 From: ofiryanai Date: Sun, 30 Nov 2025 21:04:03 +0200 Subject: [PATCH 10/19] Vector Sets: Limit sane max dim Co-authored-by: GuyAv46 <47632673+GuyAv46@users.noreply.github.com> --- .../vector-sets/tests/dimension_max_limit.py | 129 ++++++++++++++++++ modules/vector-sets/vset.c | 45 +++--- 2 files changed, 155 insertions(+), 
19 deletions(-) create mode 100644 modules/vector-sets/tests/dimension_max_limit.py diff --git a/modules/vector-sets/tests/dimension_max_limit.py b/modules/vector-sets/tests/dimension_max_limit.py new file mode 100644 index 000000000..5a142d441 --- /dev/null +++ b/modules/vector-sets/tests/dimension_max_limit.py @@ -0,0 +1,129 @@ +from test import TestCase, generate_random_vector +import struct +import redis.exceptions + +MAX_DIM = 65536 + + +class DimensionMaxLimitVaddAtLimit(TestCase): + def getname(self): + return "[regression] VADD VALUES dim == MAX_DIM accepted" + + def estimated_runtime(self): + return 0.5 + + def test(self): + dim = MAX_DIM + vec = generate_random_vector(dim) + + result = self.redis.execute_command( + 'VADD', self.test_key, + 'VALUES', dim, + *[str(x) for x in vec], + f"{self.test_key}:item:maxdim") + assert result == 1, "VADD with dimension at the limit should succeed" + + +class DimensionMaxLimitVaddAboveLimit(TestCase): + def getname(self): + return "[regression] VADD VALUES dim > MAX_DIM rejected" + + def estimated_runtime(self): + return 0.1 + + def test(self): + too_big_dim = MAX_DIM + 1 + too_big_vec = generate_random_vector(16) + try: + self.redis.execute_command( + 'VADD', self.test_key, + 'VALUES', too_big_dim, + *[str(x) for x in too_big_vec], + f"{self.test_key}:item:toolarge") + assert False, "VADD with dimension above the limit should fail" + except redis.exceptions.ResponseError as e: + # parseVector returns NULL so caller uses the generic invalid spec error + assert "invalid vector specification" in str(e), ( + f"Expected invalid vector specification error, got: {e}") + + +class DimensionMaxLimitVsimAtLimit(TestCase): + def getname(self): + return "[regression] VSIM VALUES dim == MAX_DIM accepted" + + def estimated_runtime(self): + return 0.5 + + def test(self): + # Insert a vector at the maximum allowed dimension, then query at the same dimension. 
+ dim = MAX_DIM + base_vec = generate_random_vector(dim) + + result = self.redis.execute_command( + 'VADD', self.test_key, + 'VALUES', dim, + *[str(x) for x in base_vec], + f"{self.test_key}:item:1") + assert result == 1, "VADD with dimension at the limit should succeed" + + query = generate_random_vector(dim) + res = self.redis.execute_command( + 'VSIM', self.test_key, + 'VALUES', dim, + *[str(x) for x in query], + 'COUNT', 1) + assert isinstance(res, list), "VSIM with dimension at the limit should return a list" + + +class DimensionMaxLimitVsimAboveLimit(TestCase): + def getname(self): + return "[regression] VSIM VALUES dim > MAX_DIM rejected" + + def estimated_runtime(self): + return 0.1 + + def test(self): + # Create a small index, then issue a VSIM with an over-limit dimension. + base_dim = 16 + base_vec = generate_random_vector(base_dim) + result = self.redis.execute_command( + 'VADD', self.test_key, + 'VALUES', base_dim, + *[str(x) for x in base_vec], + f"{self.test_key}:item:1") + assert result == 1, "VADD with base_dim should succeed" + + too_big_dim = MAX_DIM + 1 + too_big_vec = generate_random_vector(16) + try: + self.redis.execute_command( + 'VSIM', self.test_key, + 'VALUES', too_big_dim, + *[str(x) for x in too_big_vec], + 'COUNT', 1) + assert False, "VSIM with dimension above the limit should fail" + except redis.exceptions.ResponseError as e: + assert "invalid vector specification" in str(e), ( + f"Expected invalid vector specification error in VSIM, got: {e}") + + +class DimensionMaxLimitHugeDimension(TestCase): + def getname(self): + return "[regression] VADD VALUES absurdly large dim rejected" + + def estimated_runtime(self): + return 0.1 + + def test(self): + # Extremely large dimension close to LLONG_MAX should also be rejected safely. 
+ huge_dim = 9223372036854775807 # LLONG_MAX from the original report + try: + self.redis.execute_command( + 'VADD', self.test_key, + 'VALUES', huge_dim, + '0') # Just a dummy value; parseVector should reject based on dimension alone + assert False, "VADD with absurdly large dimension should fail" + except redis.exceptions.ResponseError as e: + assert "invalid vector specification" in str(e), ( + f"Expected invalid vector specification error for huge dim, got: {e}") + diff --git a/modules/vector-sets/vset.c b/modules/vector-sets/vset.c index 46e504b78..b99a610ca 100644 --- a/modules/vector-sets/vset.c +++ b/modules/vector-sets/vset.c @@ -134,6 +134,9 @@ static uint64_t VectorSetTypeNextId = 0; // Default num elements returned by VSIM. #define VSET_DEFAULT_COUNT 10 +// Maximum allowed vector dimension for input vectors and sets. +#define VSET_MAX_VECTOR_DIM (1<<16) + /* ========================== Internal data structure ====================== */ /* Our abstract data type needs a dual representation similar to Redis @@ -408,6 +411,7 @@ float *parseVector(RedisModuleString **argv, int argc, int start_idx, // Must be 4 bytes per component. if (vec_raw_len % 4 || vec_raw_len < 4) return NULL; *dim = vec_raw_len/4; + if (*dim > VSET_MAX_VECTOR_DIM) return NULL; vec = RedisModule_Alloc(vec_raw_len); if (!vec) return NULL; @@ -417,7 +421,7 @@ float *parseVector(RedisModuleString **argv, int argc, int start_idx, if (argc < start_idx + 2) return NULL; // Need at least the dimension. long long vdim; // Vector dimension passed by the user. if (RedisModule_StringToLongLong(argv[start_idx+1],&vdim) - != REDISMODULE_OK || vdim < 1) return NULL; + != REDISMODULE_OK || vdim < 1 || vdim > VSET_MAX_VECTOR_DIM) return NULL; // Check that all the arguments are available. 
if (argc < start_idx + 2 + vdim) return NULL; @@ -1966,6 +1970,15 @@ void *VectorSetRdbLoad(RedisModuleIO *rdb, int encver) { uint32_t quant_type = hnsw_config & 0xff; uint32_t hnsw_m = (hnsw_config >> 8) & 0xffff; + /* Validate dimension loaded from RDB to enforce invariants and + * avoid absurd allocations or inconsistent state. */ + if (dim == 0 || dim > VSET_MAX_VECTOR_DIM) { + RedisModule_LogIOError(rdb, "warning", + "Invalid vector dimension in RDB: dim=%u (max allowed %u)", + (unsigned)dim, (unsigned)VSET_MAX_VECTOR_DIM); + return NULL; + } + /* Check that the quantization type is correct. Otherwise * return ASAP signaling the error. */ if (quant_type != HNSW_QUANT_NONE && @@ -1988,32 +2001,26 @@ void *VectorSetRdbLoad(RedisModuleIO *rdb, int encver) { if (RedisModule_IsIOError(rdb)) goto ioerr; uint32_t output_dim = dim; - /* Sanity check dimensions to avoid absurd / degenerate matrices. */ - if (input_dim == 0 || output_dim == 0) { + /* Sanity check projection dimensions. */ + if (input_dim == 0 || output_dim == 0 || input_dim > VSET_MAX_VECTOR_DIM || output_dim > input_dim) { RedisModule_LogIOError(rdb, "warning", - "Invalid projection matrix dimensions: input_dim=%u, output_dim=%u", - (unsigned)input_dim, (unsigned)output_dim); + "Invalid projection matrix dimensions: input_dim=%u, output_dim=%u (max allowed %u)", + (unsigned)input_dim, (unsigned)output_dim, + (unsigned)VSET_MAX_VECTOR_DIM); goto ioerr; } /* Check for overflow in matrix_size = sizeof(float) * input_dim * output_dim. 
*/ #if SIZE_MAX == UINT32_MAX - if ((size_t)output_dim > SIZE_MAX / sizeof(float)) { - RedisModule_LogIOError(rdb, "warning", - "Projection matrix size overflow (output_dim too large): input_dim=%u, output_dim=%u", - (unsigned)input_dim, (unsigned)output_dim); - goto ioerr; - } + uint64_t product = (uint64_t) output_dim * (uint64_t) input_dim * sizeof(float); + if (product > SIZE_MAX) { + RedisModule_LogIOError(rdb, "warning", + "Projection matrix size overflow (output_dim too large): input_dim=%u, output_dim=%u", + (unsigned)input_dim, (unsigned)output_dim); + goto ioerr; + } #endif - size_t max_input = SIZE_MAX / (sizeof(float) * (size_t)output_dim); - if ((size_t)input_dim > max_input) { - RedisModule_LogIOError(rdb, "warning", - "Projection matrix size overflow: input_dim=%u, output_dim=%u", - (unsigned)input_dim, (unsigned)output_dim); - goto ioerr; - } - size_t matrix_size = sizeof(float) * (size_t)input_dim * (size_t)output_dim; /* Load projection matrix as a binary blob and validate length. */ From c8f1ec959aaeeea3eb81b3b95faba0a767ad6d87 Mon Sep 17 00:00:00 2001 From: ofiryanai Date: Mon, 24 Nov 2025 11:48:49 +0200 Subject: [PATCH 11/19] Limit VADD REDUCE dim to not exceed original dim * Limit VADD REDUCE dim to not exceed original dim Enforce VADD key [REDUCE dim] to reject dim that is bigger than the HNSW original dim, as dimension reduction makes no sense for reduce_dim > original_dim. This also avoids OOM and possible heap overflow on later allocations using reduce_dim. This should be backported to Redis version 8.0, 8.2 and 8.4. 
--- .../vector-sets/tests/dimension_validation.py | 30 +++++++++++++++++++ modules/vector-sets/vset.c | 6 ++++ 2 files changed, 36 insertions(+) diff --git a/modules/vector-sets/tests/dimension_validation.py b/modules/vector-sets/tests/dimension_validation.py index f0811529a..7e13f57cf 100644 --- a/modules/vector-sets/tests/dimension_validation.py +++ b/modules/vector-sets/tests/dimension_validation.py @@ -65,3 +65,33 @@ class DimensionValidation(TestCase): assert False, "VSIM with wrong dimension should fail" except redis.exceptions.ResponseError as e: assert "Input dimension mismatch for projection" in str(e), f"Expected dimension mismatch error in VSIM, got: {e}" + +class ReduceDimConstraintValidation(TestCase): + def getname(self): + return "[regression] VADD enforces reduce_dim <= dim" + + def estimated_runtime(self): + return 0.1 + + def test(self): + import struct + + dim = 16 + reduce_dim = dim + 1 # Intentionally larger than dim + + # Build a simple FP32 vector of the given dimension. + vec = [0.0] * dim + vec_bytes = struct.pack(f'{dim}f', *vec) + + try: + self.redis.execute_command( + 'VADD', self.test_key, + 'REDUCE', reduce_dim, + 'FP32', vec_bytes, + f'{self.test_key}:item:reducemismatch') + assert False, "VADD with reduce_dim > dim should fail" + except redis.exceptions.ResponseError as e: + # Same generic validation error path as other vector spec problems. + assert "invalid vector specification" in str(e), ( + f"Expected invalid vector error, got: {e}") + diff --git a/modules/vector-sets/vset.c b/modules/vector-sets/vset.c index b99a610ca..b3b47871b 100644 --- a/modules/vector-sets/vset.c +++ b/modules/vector-sets/vset.c @@ -445,6 +445,12 @@ float *parseVector(RedisModuleString **argv, int argc, int start_idx, return NULL; // Unknown format. 
} + // reduce_dim must be <= dim + if (reduce_dim && *reduce_dim && *reduce_dim > *dim) { + if (vec) RedisModule_Free(vec); + return NULL; + } + if (consumed_args) *consumed_args = consumed; return vec; } From 54ea50c02926a1462823349b6d5262e635619075 Mon Sep 17 00:00:00 2001 From: dannysheyn <35047315+dannysheyn@users.noreply.github.com> Date: Thu, 14 May 2026 08:05:17 +0300 Subject: [PATCH 12/19] Fix cluster-announce-ip rejecting hostnames (#15188) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes [#15183](https://github.com/redis/redis/issues/15183). ## Motivation Commit [cf668f2c2](https://github.com/redis/redis/commit/cf668f2c2c782ea12dc88458bfd329cf6eb5d658) tightened cluster-announce-ip validation to require a valid IPv4 or IPv6 address, which is a regression for users that legitimately announce a hostname. ## Changes * isValidClusterAnnounceIp() now accepts either: * A valid IPv4/IPv6 address * A valid hostname — same character rules as cluster-announce-hostname, length-bounded by NET_IP_STR_LEN to match the storage buffer. 
(cherry picked from commit 21f2569f9b577e2acb560f83652e2679c5bd6c92) --- src/config.c | 32 ++++++++++++++-------- tests/unit/cluster/announced-endpoints.tcl | 23 +++++++++++----- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/config.c b/src/config.c index 994f8ae34..c39436b20 100644 --- a/src/config.c +++ b/src/config.c @@ -2429,13 +2429,7 @@ static int isValidAnnouncedNodename(char *val,const char **err) { return 1; } -static int isValidAnnouncedHostname(char *val, const char **err) { - if (strlen(val) >= NET_HOST_STR_LEN) { - *err = "Hostnames must be less than " - STRINGIFY(NET_HOST_STR_LEN) " characters"; - return 0; - } - +static int isValidHostnameChars(char *val, const char **err) { int i = 0; char c; while ((c = val[i])) { @@ -2453,6 +2447,15 @@ static int isValidAnnouncedHostname(char *val, const char **err) { return 1; } +static int isValidAnnouncedHostname(char *val, const char **err) { + if (strlen(val) >= NET_HOST_STR_LEN) { + *err = "Hostnames must be less than " + STRINGIFY(NET_HOST_STR_LEN) " characters"; + return 0; + } + return isValidHostnameChars(val, err); +} + /* Validation function for cluster-announce-ip. * Ensures the IP address is valid and rejects control characters. 
*/ static int isValidClusterAnnounceIp(char *val, const char **err) { @@ -2462,12 +2465,19 @@ static int isValidClusterAnnounceIp(char *val, const char **err) { return 1; } - if (inet_pton(AF_INET, val, buf) != 1 && - inet_pton(AF_INET6, val, buf) != 1) { - *err = "Cluster announce IP must be a valid IPv4 or IPv6 address"; + /* Accept valid IPv4 or IPv6 */ + if (inet_pton(AF_INET, val, buf) == 1 || inet_pton(AF_INET6, val, buf) == 1) { + return 1; + } + /* Also accept valid hostnames, but limited to NET_IP_STR_LEN since + * cluster_announce_ip is stored in a NET_IP_STR_LEN buffer */ + if (strlen(val) >= NET_IP_STR_LEN) { + *err = "Hostnames for cluster-announce-ip must be less than " + STRINGIFY(NET_IP_STR_LEN) " characters"; return 0; } - return 1; + /* Also accept valid hostnames */ + return isValidHostnameChars(val, err); } /* Validate specified string is a valid proc-title-template */ diff --git a/tests/unit/cluster/announced-endpoints.tcl b/tests/unit/cluster/announced-endpoints.tcl index 5784ea6f5..58643a2a7 100644 --- a/tests/unit/cluster/announced-endpoints.tcl +++ b/tests/unit/cluster/announced-endpoints.tcl @@ -75,21 +75,26 @@ start_cluster 2 2 {tags {external:skip cluster}} { # Tests for cluster-announce-ip validation test "cluster-announce-ip validation" { + # Reject control characters in IP-like values catch {R 0 config set cluster-announce-ip "192.168.1.100\nnext"} err - assert_match "*valid IPv4 or IPv6*" $err + assert_match "*alphanumeric*" $err catch {R 0 config set cluster-announce-ip "10.0.0.1\ttab"} err - assert_match "*valid IPv4 or IPv6*" $err + assert_match "*alphanumeric*" $err catch {R 0 config set cluster-announce-ip "1.2.3.4\r\n"} err - assert_match "*valid IPv4 or IPv6*" $err + assert_match "*alphanumeric*" $err - catch {R 0 config set cluster-announce-ip "redis-node-1.example.com"} err - assert_match "*valid IPv4 or IPv6*" $err + # Reject control characters in hostname-like values + catch {R 0 config set cluster-announce-ip 
"redis-node\nnext"} err + assert_match "*alphanumeric*" $err - catch {R 0 config set cluster-announce-ip "192.168.1"} err - assert_match "*valid IPv4 or IPv6*" $err + catch {R 0 config set cluster-announce-ip "redis-node\ttab"} err + assert_match "*alphanumeric*" $err + catch {R 0 config set cluster-announce-ip "redis-node\r\n"} err + assert_match "*alphanumeric*" $err + # Accept valid IPv4 R 0 config set cluster-announce-ip "192.168.1.100" assert_equal "192.168.1.100" [lindex [R 0 config get cluster-announce-ip] 1] @@ -98,6 +103,10 @@ start_cluster 2 2 {tags {external:skip cluster}} { R 0 config set cluster-announce-ip "2001:db8::1" assert_equal "2001:db8::1" [lindex [R 0 config get cluster-announce-ip] 1] + # Accept valid hostname + R 0 config set cluster-announce-ip "redis-node-1.example.com" + assert_equal "redis-node-1.example.com" [lindex [R 0 config get cluster-announce-ip] 1] + # Can be cleared R 0 config set cluster-announce-ip "" assert_equal "" [lindex [R 0 config get cluster-announce-ip] 1] From dbe5d1e6052d00ff5f9521edbeaf56c4526305d8 Mon Sep 17 00:00:00 2001 From: Scott Lasley Date: Sat, 9 May 2026 07:17:58 -0400 Subject: [PATCH 13/19] Fix invalid repl-diskless-load value in replication test (#15178) Close #15177 Follow [Fix use-after-free when fullsync happens while replica is running a timed out script (CVE-2026-23631)](https://github.com/redis/redis/commit/0cca172a174642bdae03b871615227896274d9bb) Remove the `repl-diskless-load yes` test configuration because this option exists only in the Redis fork and is not available in Redis OSS. 
(cherry picked from commit 5033e15143d100bdc004bc69399b9d013d52ff23) --- tests/integration/replication.tcl | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index b3020ab5f..05ed71ee0 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -1894,7 +1894,6 @@ foreach type {script function} { $master config set repl-diskless-sync-delay 0 # Set small client output buffer limit to trigger fullsync quickly $master config set client-output-buffer-limit "replica 1k 1k 0" - $replica config set repl-diskless-load yes $replica config set busy-reply-threshold 1 ;# script timeout in 1 ms # Load function From 5faaed6e6bb4096a851a110e8d6751b62f6aac36 Mon Sep 17 00:00:00 2001 From: Tom Gabsow Date: Thu, 14 May 2026 14:11:59 +0300 Subject: [PATCH 14/19] updating Data Types to RC1 (#15201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JSON - 8.7.91: * Cherry-pick bug fixes to 8.8 (RedisJSON/RedisJSON#1557) * MOD-6722 Fix mutation ordering for array commands with recursive paths (RedisJSON/RedisJSON#1543) * MOD-7266 make sure to end() deserialization, to disallow trailing c… (RedisJSON/RedisJSON#1554) * MOD-14664 Json Path evaluation - Allow multi-result nodelists in filt… (RedisJSON/RedisJSON#1542) Bloom - 8.7.91 * Lili: RED-180951 RED-181297 RED-184457 RED-184456 RED-184458 RED-184459 fixing bugs and improving the code (RedisBloom/RedisBloom#1002) * MOD-14675 - refresh os list to 8.8 (RedisBloom/RedisBloom#976) Time Series - 8.7.91: * MOD-15262 - Align TimeSeries with the RedisModule_GetUserUserName API… (RedisTimeSeries/RedisTimeSeries#1985) * MOD-14420 fix count reducers return wrong NaN (RedisTimeSeries/RedisTimeSeries#2013) (RedisTimeSeries/RedisTimeSeries#2016) * Lili- RED-180951 RED-180027 fixing bugs and improving the code (RedisTimeSeries/RedisTimeSeries#2003) * MOD-14674 - refresh os list to 8.8 
(RedisTimeSeries/RedisTimeSeries#1946) --- modules/redisbloom/Makefile | 2 +- modules/redisjson/Makefile | 2 +- modules/redistimeseries/Makefile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/redisbloom/Makefile b/modules/redisbloom/Makefile index f40cc7c1f..2fa608a0e 100644 --- a/modules/redisbloom/Makefile +++ b/modules/redisbloom/Makefile @@ -1,5 +1,5 @@ SRC_DIR = src -MODULE_VERSION = v8.7.90 +MODULE_VERSION = v8.7.91 MODULE_REPO = https://github.com/redisbloom/redisbloom TARGET_MODULE = $(SRC_DIR)/bin/$(FULL_VARIANT)/redisbloom.so diff --git a/modules/redisjson/Makefile b/modules/redisjson/Makefile index 4d13ed7bc..e85e5297d 100644 --- a/modules/redisjson/Makefile +++ b/modules/redisjson/Makefile @@ -1,5 +1,5 @@ SRC_DIR = src -MODULE_VERSION = v8.7.90 +MODULE_VERSION = v8.7.91 MODULE_REPO = https://github.com/redisjson/redisjson TARGET_MODULE = $(SRC_DIR)/bin/$(FULL_VARIANT)/rejson.so diff --git a/modules/redistimeseries/Makefile b/modules/redistimeseries/Makefile index 1bd8b46ca..b5da541dd 100644 --- a/modules/redistimeseries/Makefile +++ b/modules/redistimeseries/Makefile @@ -1,5 +1,5 @@ SRC_DIR = src -MODULE_VERSION = v8.7.90 +MODULE_VERSION = v8.7.91 MODULE_REPO = https://github.com/redistimeseries/redistimeseries TARGET_MODULE = $(SRC_DIR)/bin/$(FULL_VARIANT)/redistimeseries.so From 2ad98a4e3c0d64ec7b4c1d6d748be7b88628431b Mon Sep 17 00:00:00 2001 From: Omer Shadmi <76992134+oshadmi@users.noreply.github.com> Date: Thu, 14 May 2026 14:23:32 +0300 Subject: [PATCH 15/19] Update RediSearch to v8.7.91 (#15199) --- modules/redisearch/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/redisearch/Makefile b/modules/redisearch/Makefile index dee1ef04c..c2cc409be 100644 --- a/modules/redisearch/Makefile +++ b/modules/redisearch/Makefile @@ -1,5 +1,5 @@ SRC_DIR = src -MODULE_VERSION = v8.7.90 +MODULE_VERSION = v8.7.91 MODULE_REPO = https://github.com/redisearch/redisearch TARGET_MODULE = 
$(SRC_DIR)/bin/$(FULL_VARIANT)/search-community/redisearch.so From c3db5254b76e39963261862135b19848f22d745d Mon Sep 17 00:00:00 2001 From: Lior Kogan Date: Thu, 14 May 2026 14:58:48 +0300 Subject: [PATCH 16/19] Added Array, rename RQE to Search (#15186) --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5ea9241ad..21de64642 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Redis excels in various applications, including: - **Distributed Session Store:** Offers flexible session data modeling (string, JSON, hash). - **Data Structure Server:** Provides low-level data structures (strings, lists, sets, hashes, sorted sets, JSON, etc.) with high-level semantics (counters, queues, leaderboards, rate limiters) and supports transactions & scripting. - **NoSQL Data Store:** Key-value, document, and time series data storage. -- **Search and Query Engine:** Indexing for hash/JSON documents, supporting vector search, full-text search, geospatial queries, ranking, and aggregations via Redis Query Engine. +- **Search and Query Engine:** Indexing for hash/JSON documents, supporting vector search, full-text search, geospatial queries, ranking, and aggregations via Redis Search. - **Event Store & Message Broker:** Implements queues (lists), priority queues (sorted sets), event deduplication (sets), streams, and pub/sub with probabilistic stream processing capabilities. - **Vector Store for GenAI:** Integrates with AI applications (e.g. LangGraph, mem0) for short-term memory, long-term memory, LLM response caching (semantic caching), and retrieval augmented generation (RAG). - **Real-Time Analytics:** Powers personalization, recommendations, fraud detection, and risk assessment. 
@@ -172,9 +172,10 @@ Redis provides a variety of data types, processing engines, and capabilities to **Important:** Features marked with an asterisk (\*) require Redis to be compiled with the `BUILD_WITH_MODULES=yes` flag when [building Redis from source](#build-redis-from-source) - [**String:**](https://redis.io/docs/latest/develop/data-types/strings) Sequences of bytes, including text, serialized objects, and binary arrays used for caching, counters, and bitwise operations. -- [**JSON:**](https://redis.io/docs/latest/develop/data-types/json/) Nested JSON documents that are indexed and searchable using JSONPath expressions and with [Redis Query Engine](https://redis.io/docs/latest/develop/interact/search-and-query/) +- [**JSON:**](https://redis.io/docs/latest/develop/data-types/json/) Nested JSON documents that are indexed and searchable using JSONPath expressions and with [Redis Search](https://redis.io/docs/latest/develop/ai/search-and-query/) +- [**Array:**](https://redis.io/docs/latest/develop/data-types/arrays/) Sparse, index-addressable collection of string values - [**Hash:**](https://redis.io/docs/latest/develop/data-types/hashes/) Field-value maps used to represent basic objects and store groupings of key-value pairs with support for [hash field expiration (TTL)](https://redis.io/docs/latest/develop/data-types/hashes/#field-expiration) -- [**Redis Query Engine:**](https://redis.io/docs/latest/develop/interact/search-and-query/) Use Redis as a document database, a vector database, a secondary index, and a search engine. Define indexes for hash and JSON documents and then use a rich query language for vector search, full-text search, geospatial queries, and aggregations. +- [**Redis Search:**](https://redis.io/docs/latest/develop/ai/search-and-query/) Use Redis as a document database, a vector database, a secondary index, and a search engine. 
Define indexes for hash and JSON documents and then use a rich query language for vector search, full-text search, geospatial queries, and aggregations. - [**List:**](https://redis.io/docs/latest/develop/data-types/lists/) Linked lists of string values used as stacks, queues, and for queue management. - [**Set:**](https://redis.io/docs/latest/develop/data-types/sets/) Unordered collection of unique strings used for tracking unique items, relations, and common set operations (intersections, unions, differences). - [**Sorted set:**](https://redis.io/docs/latest/develop/data-types/sorted-sets/) Collection of unique strings ordered by an associated score used for leaderboards and rate limiters. From 2e46d2e7359d71ea0fe4219fd218143b9e7ab03c Mon Sep 17 00:00:00 2001 From: Mincho Paskalev Date: Thu, 14 May 2026 16:31:25 +0300 Subject: [PATCH 17/19] Hold GCRA out of the release (#15191) After the GCRA algorithm was introduced into Redis (https://github.com/redis/redis/pull/14826), followed by the new RATE_LIMIT object type (https://github.com/redis/redis/pull/14905), it was internally decided not to include GCRA in the new release. Since no decision has been made yet on whether it will be kept in the future, this PR only makes the GCRA-related code dead: commands are inaccessible and AOF/RDB load+save is disabled.
--------- Co-authored-by: debing.sun --- redis.conf | 3 +- src/acl.c | 2 + src/aof.c | 4 + src/commands.def | 58 +------------- src/commands/gcra.json | 92 ---------------------- src/commands/gcrasetvalue.json | 52 ------------ src/config.c | 6 +- src/db.c | 8 +- src/debug.c | 6 ++ src/defrag.c | 2 + src/gcra.c | 4 + src/module.c | 2 + src/notify.c | 4 + src/object.c | 20 +++++ src/object.h | 2 +- src/rdb.c | 6 ++ src/rdb.h | 10 ++- src/redis-check-rdb.c | 4 +- src/redismodule.h | 11 ++- src/server.h | 23 ++++-- tests/assets/array-32bit.rdb | Bin 808 -> 808 bytes tests/integration/corrupt-dump-fuzzer.tcl | 2 + tests/support/util.tcl | 7 +- tests/unit/gcra.tcl | 6 ++ utils/generate-command-code.py | 2 + 25 files changed, 117 insertions(+), 219 deletions(-) delete mode 100644 src/commands/gcra.json delete mode 100644 src/commands/gcrasetvalue.json diff --git a/redis.conf b/redis.conf index 3688ae5e1..9151c8fc8 100644 --- a/redis.conf +++ b/redis.conf @@ -2051,7 +2051,6 @@ latency-monitor-threshold 0 # (Note: not included in the 'A' class) # c Type-changed events generated every time a key's type changes # (Note: not included in the 'A' class) -# r rate limit event # S Subkeyspace events, published with __subkeyspace@__: prefix. # T Subkeyevent events, published with __subkeyevent@__: prefix. # I Subkeyspaceitem events, published per subkey with @@ -2059,7 +2058,7 @@ latency-monitor-threshold 0 # V Subkeyspaceevent events, published with # __subkeyspaceevent@__:| prefix. # A Alias for g$lshzxetad, so that the "AKE" string means all the events -# except key-miss, new key, overwritten, type-changed and rate-limit. +# except key-miss, new key, overwritten and type-changed. # # The "notify-keyspace-events" takes as argument a string that is composed # of zero or multiple characters. 
The empty string means that notifications diff --git a/src/acl.c b/src/acl.c index 95f749299..177077d45 100644 --- a/src/acl.c +++ b/src/acl.c @@ -71,7 +71,9 @@ struct ACLCategoryItem { {"connection", ACL_CATEGORY_CONNECTION}, {"transaction", ACL_CATEGORY_TRANSACTION}, {"scripting", ACL_CATEGORY_SCRIPTING}, +#ifdef ENABLE_GCRA {"ratelimit", ACL_CATEGORY_RATE_LIMIT}, +#endif {NULL,0} /* Terminator. */ }; diff --git a/src/aof.c b/src/aof.c index 8b6eb5709..9e55a78b7 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2467,6 +2467,7 @@ int rewriteStreamObject(rio *r, robj *key, robj *o) { return 1; } +#ifdef ENABLE_GCRA int rewriteGCRAObject(rio *r, robj *key, robj *o) { long long val; getLongLongFromGCRAObject(o, &val); @@ -2478,6 +2479,7 @@ int rewriteGCRAObject(rio *r, robj *key, robj *o) { if (rioWriteBulkLongLong(r,val) == 0) return 0; return 1; } +#endif /* Call the module type callback in order to rewrite a data type * that is exported by a module and is not handled by Redis itself. @@ -2644,8 +2646,10 @@ int rewriteObject(rio *r, robj *key, robj *o, int dbid, long long expiretime) { if (rewriteHashObject(r,key,o) == 0) return C_ERR; } else if (o->type == OBJ_STREAM) { if (rewriteStreamObject(r,key,o) == 0) return C_ERR; +#ifdef ENABLE_GCRA } else if (o->type == OBJ_GCRA) { if (rewriteGCRAObject(r,key,o) == 0) return C_ERR; +#endif } else if (o->type == OBJ_ARRAY) { if (rewriteArrayObject(r,key,o) == 0) return C_ERR; } else if (o->type == OBJ_MODULE) { diff --git a/src/commands.def b/src/commands.def index 2aff10043..9b5692aa3 100644 --- a/src/commands.def +++ b/src/commands.def @@ -26,7 +26,9 @@ const char *COMMAND_GROUP_STR[] = { "bitmap", "array", "module", +#ifdef ENABLE_GCRA "rate_limit" +#endif }; const char *commandGroupStr(int index) { @@ -5910,59 +5912,6 @@ struct COMMAND_ARG UNSUBSCRIBE_Args[] = { {MAKE_ARG("channel",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL|CMD_ARG_MULTIPLE,0,NULL)}, }; -/********** GCRA ********************/ - -#ifndef 
SKIP_CMD_HISTORY_TABLE -/* GCRA history */ -#define GCRA_History NULL -#endif - -#ifndef SKIP_CMD_TIPS_TABLE -/* GCRA tips */ -#define GCRA_Tips NULL -#endif - -#ifndef SKIP_CMD_KEY_SPECS_TABLE -/* GCRA key specs */ -keySpec GCRA_Keyspecs[1] = { -{NULL,CMD_KEY_RW|CMD_KEY_ACCESS|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} -}; -#endif - -/* GCRA argument table */ -struct COMMAND_ARG GCRA_Args[] = { -{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("max-burst",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("tokens-per-period",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("period",ARG_TYPE_DOUBLE,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("count",ARG_TYPE_INTEGER,-1,"TOKENS",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, -}; - -/********** GCRASETVALUE ********************/ - -#ifndef SKIP_CMD_HISTORY_TABLE -/* GCRASETVALUE history */ -#define GCRASETVALUE_History NULL -#endif - -#ifndef SKIP_CMD_TIPS_TABLE -/* GCRASETVALUE tips */ -#define GCRASETVALUE_Tips NULL -#endif - -#ifndef SKIP_CMD_KEY_SPECS_TABLE -/* GCRASETVALUE key specs */ -keySpec GCRASETVALUE_Keyspecs[1] = { -{NULL,CMD_KEY_OW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} -}; -#endif - -/* GCRASETVALUE argument table */ -struct COMMAND_ARG GCRASETVALUE_Args[] = { -{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("tat",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -}; - /********** EVAL ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -12556,9 +12505,6 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("subscribe","Listens for messages published to channels.","O(N) where N is the number of channels to subscribe 
to.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SUBSCRIBE_History,0,SUBSCRIBE_Tips,0,subscribeCommand,-2,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL|CMD_DENYOOM,0,SUBSCRIBE_Keyspecs,0,NULL,1),.args=SUBSCRIBE_Args}, {MAKE_CMD("sunsubscribe","Stops listening to messages posted to shard channels.","O(N) where N is the number of shard channels to unsubscribe.","7.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,SUNSUBSCRIBE_History,0,SUNSUBSCRIBE_Tips,0,sunsubscribeCommand,-1,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,SUNSUBSCRIBE_Keyspecs,1,NULL,1),.args=SUNSUBSCRIBE_Args}, {MAKE_CMD("unsubscribe","Stops listening to messages posted to channels.","O(N) where N is the number of channels to unsubscribe.","2.0.0",CMD_DOC_NONE,NULL,NULL,"pubsub",COMMAND_GROUP_PUBSUB,UNSUBSCRIBE_History,0,UNSUBSCRIBE_Tips,0,unsubscribeCommand,-1,CMD_PUBSUB|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,UNSUBSCRIBE_Keyspecs,0,NULL,1),.args=UNSUBSCRIBE_Args}, -/* rate_limit */ -{MAKE_CMD("gcra","Rate limit via GCRA (Generic Cell Rate Algorithm).","O(1)","8.8.0",CMD_DOC_NONE,NULL,NULL,"rate_limit",COMMAND_GROUP_RATE_LIMIT,GCRA_History,0,GCRA_Tips,0,gcraCommand,-5,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_RATE_LIMIT,GCRA_Keyspecs,1,NULL,5),.args=GCRA_Args}, -{MAKE_CMD("gcrasetvalue","An internal command for recording a GCRA TAT value during AOF rewrite and replication.","O(1)","8.8.0",CMD_DOC_NONE,NULL,NULL,"rate_limit",COMMAND_GROUP_RATE_LIMIT,GCRASETVALUE_History,0,GCRASETVALUE_Tips,0,gcraSetValueCommand,3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_RATE_LIMIT,GCRASETVALUE_Keyspecs,1,NULL,2),.args=GCRASETVALUE_Args}, /* scripting */ {MAKE_CMD("eval","Executes a server-side Lua script.","Depends on the script that is 
executed.","2.6.0",CMD_DOC_NONE,NULL,NULL,"scripting",COMMAND_GROUP_SCRIPTING,EVAL_History,0,EVAL_Tips,0,evalCommand,-3,CMD_NOSCRIPT|CMD_SKIP_MONITOR|CMD_MAY_REPLICATE|CMD_NO_MANDATORY_KEYS|CMD_STALE,ACL_CATEGORY_SCRIPTING,EVAL_Keyspecs,1,evalGetKeys,4),.args=EVAL_Args}, {MAKE_CMD("evalsha","Executes a server-side Lua script by SHA1 digest.","Depends on the script that is executed.","2.6.0",CMD_DOC_NONE,NULL,NULL,"scripting",COMMAND_GROUP_SCRIPTING,EVALSHA_History,0,EVALSHA_Tips,0,evalShaCommand,-3,CMD_NOSCRIPT|CMD_SKIP_MONITOR|CMD_MAY_REPLICATE|CMD_NO_MANDATORY_KEYS|CMD_STALE,ACL_CATEGORY_SCRIPTING,EVALSHA_Keyspecs,1,evalGetKeys,4),.args=EVALSHA_Args}, diff --git a/src/commands/gcra.json b/src/commands/gcra.json deleted file mode 100644 index 6980af1ac..000000000 --- a/src/commands/gcra.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "GCRA": { - "summary": "Rate limit via GCRA (Generic Cell Rate Algorithm).", - "complexity": "O(1)", - "group": "rate_limit", - "since": "8.8.0", - "arity": -5, - "function": "gcraCommand", - "command_flags": [ - "WRITE", - "DENYOOM", - "FAST" - ], - "acl_categories": [ - "RATE_LIMIT" - ], - "key_specs": [ - { - "flags": [ - "RW", - "ACCESS", - "UPDATE" - ], - "begin_search": { - "index": { - "pos": 1 - } - }, - "find_keys": { - "range": { - "lastkey": 0, - "step": 1, - "limit": 0 - } - } - } - ], - "reply_schema": { - "type": "array", - "minItems": 5, - "maxItems": 5, - "description": "Rate limiting result", - "items": [ - { - "type": "integer", - "description": "Limited: 0 if allowed, 1 if rate limited" - }, - { - "type": "integer", - "description": "Max request tokens: always equal to max_burst+1" - }, - { - "type": "integer", - "description": "Number of tokens available immediately" - }, - { - "type": "integer", - "description": "Retry after: seconds after which the caller should retry. 
Always -1 if not limited" - }, - { - "type": "integer", - "description": "Full burst after: seconds after which a full burst will be allowed" - } - ] - }, - "arguments": [ - { - "name": "key", - "type": "key", - "key_spec_index": 0 - }, - { - "name": "max-burst", - "type": "integer" - }, - { - "name": "tokens-per-period", - "type": "integer" - }, - { - "name": "period", - "type": "double" - }, - { - "name": "count", - "type": "integer", - "token": "TOKENS", - "optional": true - } - ] - } -} diff --git a/src/commands/gcrasetvalue.json b/src/commands/gcrasetvalue.json deleted file mode 100644 index 5cce15cf4..000000000 --- a/src/commands/gcrasetvalue.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "GCRASETVALUE": { - "summary": "An internal command for recording a GCRA TAT value during AOF rewrite and replication.", - "complexity": "O(1)", - "group": "rate_limit", - "since": "8.8.0", - "arity": 3, - "function": "gcraSetValueCommand", - "command_flags": [ - "WRITE", - "DENYOOM", - "FAST" - ], - "acl_categories": [ - "RATE_LIMIT" - ], - "key_specs": [ - { - "flags": [ - "OW", - "UPDATE" - ], - "begin_search": { - "index": { - "pos": 1 - } - }, - "find_keys": { - "range": { - "lastkey": 0, - "step": 1, - "limit": 0 - } - } - } - ], - "reply_schema": { - "const": "OK" - }, - "arguments": [ - { - "name": "key", - "type": "key", - "key_spec_index": 0 - }, - { - "name": "tat", - "type": "integer" - } - ] - } -} diff --git a/src/config.c b/src/config.c index c39436b20..59bef0a1b 100644 --- a/src/config.c +++ b/src/config.c @@ -2972,7 +2972,11 @@ static int setConfigNotifyKeyspaceEventsOption(standardConfig *config, sds *argv } int flags = keyspaceEventsStringToFlags(argv[0]); if (flags == -1) { - *err = "Invalid event class character. Use 'Ag$lshzxeKEtmdnocrSTIV'."; +#ifdef ENABLE_GCRA + *err = "Invalid event class character. Use 'Ag$lshzxeKEtmdnocraSTIV'."; +#else + *err = "Invalid event class character. 
Use 'Ag$lshzxeKEtmdnocaSTIV'."; +#endif return 0; } server.notify_keyspace_events = flags; diff --git a/src/db.c b/src/db.c index e2dd50b50..87881a991 100644 --- a/src/db.c +++ b/src/db.c @@ -1758,8 +1758,10 @@ char *obj_type_name[OBJ_TYPE_MAX] = { "hash", NULL, /* module type is special */ "stream", - "gcra", - "array" + "array", +#ifdef ENABLE_GCRA + "gcra" +#endif }; /* Helper function to get type from a string in scan commands */ @@ -2434,7 +2436,9 @@ void copyCommand(client *c) { case OBJ_ZSET: newobj = zsetDup(o); break; case OBJ_HASH: newobj = hashTypeDup(o, &minHashExpire); break; case OBJ_STREAM: newobj = streamDup(o); break; +#ifdef ENABLE_GCRA case OBJ_GCRA: newobj = gcraDup(o); break; +#endif case OBJ_MODULE: newobj = moduleTypeDupOrReply(c, key, newkey, dst->id, o); if (!newobj) return; diff --git a/src/debug.c b/src/debug.c index 9853d6c65..e14f2a52b 100644 --- a/src/debug.c +++ b/src/debug.c @@ -123,6 +123,7 @@ void mixStringObjectDigest(unsigned char *digest, robj *o) { decrRefCount(o); } +#ifdef ENABLE_GCRA void mixGCRAObjectDigest(unsigned char *digest, robj *o) { char buf[LONG_STR_SIZE]; long long val; @@ -130,6 +131,7 @@ void mixGCRAObjectDigest(unsigned char *digest, robj *o) { int len = ll2string(buf, sizeof(buf), val); mixDigest(digest,buf,len); } +#endif /* This function computes the digest of a data structure stored in the * object 'o'. 
It is the core of the DEBUG DIGEST command: when taking the @@ -263,8 +265,10 @@ void xorObjectDigest(redisDb *db, robj *keyobj, unsigned char *digest, robj *o) } } streamIteratorStop(&si); +#ifdef ENABLE_GCRA } else if (o->type == OBJ_GCRA) { mixGCRAObjectDigest(digest, o); +#endif } else if (o->type == OBJ_MODULE) { RedisModuleDigest md = {{0},{0},keyobj,db->id}; moduleValue *mv = o->ptr; @@ -1327,9 +1331,11 @@ void serverLogObjectDebugInfo(const robj *o) { serverLog(LL_WARNING,"Skiplist level: %d", (int) ((const zset*)o->ptr)->zsl->level); } else if (o->type == OBJ_STREAM) { serverLog(LL_WARNING,"Stream size: %d", (int) streamLength(o)); +#ifdef ENABLE_GCRA } else if (o->type == OBJ_GCRA) { #if UINTPTR_MAX == 0xffffffffffffffff serverLog(LL_WARNING, "GCRA object: %lld", (long long)o->ptr); +#endif #endif } #endif diff --git a/src/defrag.c b/src/defrag.c index 010d4de23..913e457c2 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -1189,12 +1189,14 @@ void defragKey(defragKeysCtx *ctx, dictEntry *de, dictEntryLink link) { } } else if (ob->type == OBJ_STREAM) { defragStream(ctx, ob); +#ifdef ENABLE_GCRA } else if (ob->type == OBJ_GCRA) { /* GCRA object is just an allocation to a long long value */ #if UINTPTR_MAX == 0xffffffff void *newptr, *ptr = ob->ptr; if ((newptr = activeDefragAlloc(ptr))) ob->ptr = newptr; +#endif #endif } else if (ob->type == OBJ_MODULE) { defragModule(ctx,db, ob); diff --git a/src/gcra.c b/src/gcra.c index 488fad5ce..a6b738824 100644 --- a/src/gcra.c +++ b/src/gcra.c @@ -9,6 +9,8 @@ #include "server.h" #include +#ifdef ENABLE_GCRA + /* GCRA algorithm for rate limiting. * Implementation is heavily based on the implementation of (redis-cell) * [https://github.com/brandur/redis-cell] by (brandur)[https://github.com/brandur]. 
@@ -278,3 +280,5 @@ robj *gcraDup(robj *o) { getLongLongFromGCRAObject(o, &val); return createGCRAObject(val); } + +#endif /* ENABLE_GCRA */ diff --git a/src/module.c b/src/module.c index 0b1fb131d..38a6ad388 100644 --- a/src/module.c +++ b/src/module.c @@ -4254,7 +4254,9 @@ int RM_KeyType(RedisModuleKey *key) { case OBJ_HASH: return REDISMODULE_KEYTYPE_HASH; case OBJ_MODULE: return REDISMODULE_KEYTYPE_MODULE; case OBJ_STREAM: return REDISMODULE_KEYTYPE_STREAM; +#ifdef ENABLE_GCRA case OBJ_GCRA: return REDISMODULE_KEYTYPE_GCRA; +#endif case OBJ_ARRAY: return REDISMODULE_KEYTYPE_ARRAY; default: return REDISMODULE_KEYTYPE_EMPTY; } diff --git a/src/notify.c b/src/notify.c index c8e884204..5c8b188fa 100644 --- a/src/notify.c +++ b/src/notify.c @@ -41,7 +41,9 @@ int keyspaceEventsStringToFlags(char *classes) { case 'n': flags |= NOTIFY_NEW; break; case 'o': flags |= NOTIFY_OVERWRITTEN; break; case 'c': flags |= NOTIFY_TYPE_CHANGED; break; +#ifdef ENABLE_GCRA case 'r': flags |= NOTIFY_RATE_LIMIT; break; +#endif case 'S': flags |= NOTIFY_SUBKEYSPACE; break; case 'T': flags |= NOTIFY_SUBKEYEVENT; break; case 'I': flags |= NOTIFY_SUBKEYSPACEITEM; break; @@ -77,7 +79,9 @@ sds keyspaceEventsFlagsToString(int flags) { if (flags & NOTIFY_NEW) res = sdscatlen(res,"n",1); if (flags & NOTIFY_OVERWRITTEN) res = sdscatlen(res,"o",1); if (flags & NOTIFY_TYPE_CHANGED) res = sdscatlen(res,"c",1); +#ifdef ENABLE_GCRA if (flags & NOTIFY_RATE_LIMIT) res = sdscatlen(res,"r",1); +#endif } if (flags & NOTIFY_KEYSPACE) res = sdscatlen(res,"K",1); if (flags & NOTIFY_KEYEVENT) res = sdscatlen(res,"E",1); diff --git a/src/object.c b/src/object.c index 4ba1b4978..697ed6e39 100644 --- a/src/object.c +++ b/src/object.c @@ -514,6 +514,7 @@ robj *createStreamObject(void) { return o; } +#ifdef ENABLE_GCRA robj *createGCRAObject(long long value) { /* NOTE: for 32-bit systems we can't use integer encoding (as OBJ_STRING does) * as the GCRA object is a unixtime value in microseconds, which as of the @@ 
-530,6 +531,7 @@ robj *createGCRAObject(long long value) { o->encoding = OBJ_ENCODING_INT; return o; } +#endif robj *createArrayObject(void) { redisArray *ar = arNew(); @@ -610,6 +612,7 @@ void freeStreamObject(robj *o) { freeStream(o->ptr); } +#ifdef ENABLE_GCRA void freeGCRAObject(robj *o) { #if UINTPTR_MAX == 0xffffffff zfree(o->ptr); @@ -617,6 +620,7 @@ void freeGCRAObject(robj *o) { (void)o; #endif } +#endif void freeArrayObject(robj *o) { arFree(o->ptr); @@ -673,7 +677,9 @@ void decrRefCount(robj *o) { case OBJ_HASH: freeHashObject(o); break; case OBJ_MODULE: freeModuleObject(o); break; case OBJ_STREAM: freeStreamObject(o); break; +#ifdef ENABLE_GCRA case OBJ_GCRA: freeGCRAObject(o); break; +#endif case OBJ_ARRAY: freeArrayObject(o); break; default: serverPanic("Unknown object type"); break; } @@ -827,12 +833,14 @@ void dismissArrayObject(robj *o, size_t size_hint) { arDismiss(o->ptr, size_hint); } +#ifdef ENABLE_GCRA void dismissGCRAObject(robj *o, size_t size_hint) { /* GCRA is a single allocation of a long long thus way smaller than a * page-size. 
The dismiss mechanism is not needed for it - hence NOOP.*/ (void)o; (void)size_hint; } +#endif /* When creating a snapshot in a fork child process, the main process and child * process share the same physical memory pages, and if / when the parent @@ -862,7 +870,9 @@ void dismissObject(robj *o, size_t size_hint) { case OBJ_ZSET: dismissZsetObject(o, size_hint); break; case OBJ_HASH: dismissHashObject(o, size_hint); break; case OBJ_STREAM: dismissStreamObject(o, size_hint); break; +#ifdef ENABLE_GCRA case OBJ_GCRA: dismissGCRAObject(o, size_hint); break; +#endif case OBJ_ARRAY: dismissArrayObject(o, size_hint); break; default: break; } @@ -985,7 +995,9 @@ size_t getObjectLength(robj *o) { case OBJ_ZSET: return zsetLength(o); case OBJ_HASH: return hashTypeLength(o, 0); case OBJ_STREAM: return streamLength(o); +#ifdef ENABLE_GCRA case OBJ_GCRA: return gcraObjectLength(o); +#endif case OBJ_ARRAY: return arCount(o->ptr); default: return 0; } @@ -1195,6 +1207,7 @@ int getLongLongFromObject(robj *o, long long *target) { return C_OK; } +#ifdef ENABLE_GCRA int getLongLongFromGCRAObject(robj *o, long long *target) { long long res; serverAssertWithInfo(NULL, o, o->type == OBJ_GCRA); @@ -1210,6 +1223,7 @@ int getLongLongFromGCRAObject(robj *o, long long *target) { *target = res; return C_OK; } +#endif int getLongLongFromObjectOrReply(client *c, robj *o, long long *target, const char *msg) { long long value; @@ -1303,7 +1317,9 @@ size_t kvobjComputeSize(robj *key, kvobj *o, size_t sample_size, int dbid) { o->type == OBJ_ZSET || o->type == OBJ_HASH || o->type == OBJ_STREAM || +#ifdef ENABLE_GCRA o->type == OBJ_GCRA || +#endif o->type == OBJ_ARRAY) { return kvobjAllocSize(o); @@ -1330,8 +1346,10 @@ size_t kvobjAllocSize(kvobj *o) { } else if (o->type == OBJ_STREAM) { stream *s = o->ptr; asize += s->alloc_size; +#ifdef ENABLE_GCRA } else if (o->type == OBJ_GCRA) { asize += gcraTypeAllocSize(o); +#endif } else if (o->type == OBJ_ARRAY) { redisArray *ar = o->ptr; asize += 
ar->alloc_size; @@ -1341,6 +1359,7 @@ size_t kvobjAllocSize(kvobj *o) { return asize; } +#ifdef ENABLE_GCRA size_t gcraTypeAllocSize(robj *o) { (void)o; #if UINTPTR_MAX == 0xffffffff @@ -1357,6 +1376,7 @@ size_t gcraObjectLength(robj *o) { (void)o; return 1; } +#endif /* Release data obtained with getMemoryOverheadData(). */ void freeMemoryOverheadData(struct redisMemOverhead *mh) { diff --git a/src/object.h b/src/object.h index 9fbf0f04e..35cd40a3c 100644 --- a/src/object.h +++ b/src/object.h @@ -5,7 +5,7 @@ * values of different logical types (strings, lists, sets, hashes, sorted sets, * streams, modules, ...). It contains: * - type: one of OBJ_STRING, OBJ_LIST, OBJ_SET, OBJ_ZSET, OBJ_HASH, OBJ_STREAM, - * OBJ_GCRA, OBJ_MODULE, ... + * OBJ_MODULE, ... * - encoding: an implementation detail of how the value is represented in * memory for the given type (see OBJ_ENCODING_* below). For example, * strings may be RAW/EMBSTR/INT, sets may be INTSET or HT, etc. diff --git a/src/rdb.c b/src/rdb.c index 31e813199..9793c2672 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -722,8 +722,10 @@ int rdbSaveObjectType(rio *rdb, robj *o) { serverPanic("Unknown hash encoding"); case OBJ_STREAM: return rdbSaveType(rdb,RDB_TYPE_STREAM_LISTPACKS_5); +#ifdef ENABLE_GCRA case OBJ_GCRA: return rdbSaveType(rdb,RDB_TYPE_GCRA); +#endif case OBJ_MODULE: return rdbSaveType(rdb,RDB_TYPE_MODULE_2); case OBJ_ARRAY: @@ -1474,11 +1476,13 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { /* Save the all-time count of duplicate IIDs detected. */ if ((n = rdbSaveLen(rdb,s->iids_duplicates)) == -1) return -1; nwritten += n; +#ifdef ENABLE_GCRA } else if (o->type == OBJ_GCRA) { long long t; getLongLongFromGCRAObject(o, &t); if ((n = rdbSaveLen(rdb,t)) == -1) return -1; nwritten += n; +#endif } else if (o->type == OBJ_MODULE) { /* Save a module-specific value. 
*/ RedisModuleIO io; @@ -3769,6 +3773,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) return NULL; } o = createModuleObject(mt, ptr); +#ifdef ENABLE_GCRA } else if (rdbtype == RDB_TYPE_GCRA) { uint64_t time = rdbLoadLen(rdb, NULL); if (time == RDB_LENERR || time > LLONG_MAX) { @@ -3776,6 +3781,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) return NULL; } o = createGCRAObject((long long)time); +#endif } else if (rdbtype == RDB_TYPE_ARRAY) { /* Load array value. We only persist elements and insert_idx - no * implementation details. Arrays use current ar_slice_size config. */ diff --git a/src/rdb.h b/src/rdb.h index 159992dc0..7e49ddff0 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -80,12 +80,18 @@ #define RDB_TYPE_HASH_LISTPACK_EX 25 /* Hash LP with HFEs. Attach min TTL at start */ #define RDB_TYPE_STREAM_LISTPACKS_4 26 /* Stream with IDMP support */ #define RDB_TYPE_STREAM_LISTPACKS_5 27 /* Stream with XNACK support (NACKed entries) */ -#define RDB_TYPE_GCRA 28 /* GCRA object */ -#define RDB_TYPE_ARRAY 29 /* Array data type */ +#define RDB_TYPE_ARRAY 28 /* Array data type */ +#ifdef ENABLE_GCRA +#define RDB_TYPE_GCRA 29 /* GCRA object */ +#endif /* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType(), and rdb_type_string[] */ /* Test if a type is an object type. */ +#ifdef ENABLE_GCRA #define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 29)) +#else +#define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 28)) +#endif /* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */ #define RDB_OPCODE_KEY_META 243 /* Key metadata (module metadata classes). 
*/ diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c index d3bbe4b40..e4c10216d 100644 --- a/src/redis-check-rdb.c +++ b/src/redis-check-rdb.c @@ -88,8 +88,10 @@ char *rdb_type_string[] = { "hash-listpack-md", "stream-v4", "stream-v5", - "gcra", "array", +#ifdef ENABLE_GCRA + "gcra", +#endif }; /* Show a few stats collected into 'rdbstate' */ diff --git a/src/redismodule.h b/src/redismodule.h index d78b0e26d..f0d9e8aa6 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -89,8 +89,7 @@ typedef long long ustime_t; #define REDISMODULE_KEYTYPE_ZSET 5 #define REDISMODULE_KEYTYPE_MODULE 6 #define REDISMODULE_KEYTYPE_STREAM 7 -#define REDISMODULE_KEYTYPE_GCRA 8 -#define REDISMODULE_KEYTYPE_ARRAY 9 +#define REDISMODULE_KEYTYPE_ARRAY 8 /* Reply types. */ #define REDISMODULE_REPLY_UNKNOWN -1 @@ -249,18 +248,24 @@ This flag should not be used directly by the module. #define REDISMODULE_NOTIFY_OVERWRITTEN (1<<15) /* o, key overwrite notification */ #define REDISMODULE_NOTIFY_TYPE_CHANGED (1<<16) /* c, key type changed notification */ #define REDISMODULE_NOTIFY_KEY_TRIMMED (1<<17) /* module only key space notification, indicates a key trimmed during slot migration */ -#define REDISMODULE_NOTIFY_RATE_LIMIT (1<<18) /* r, rate limit event */ #define REDISMODULE_NOTIFY_SUBKEYSPACE (1<<19) /* S */ #define REDISMODULE_NOTIFY_SUBKEYEVENT (1<<20) /* T */ #define REDISMODULE_NOTIFY_SUBKEYSPACEITEM (1<<21) /* I */ #define REDISMODULE_NOTIFY_SUBKEYSPACEEVENT (1<<22) /* V */ #define REDISMODULE_NOTIFY_ARRAY (1<<23) /* a, array key space notification */ +#ifdef ENABLE_GCRA +#define REDISMODULE_NOTIFY_RATE_LIMIT (1<<24) /* r, rate limit event */ +#endif /* Next notification flag, must be updated when adding new flags above! This flag should not be used directly by the module. * Use RedisModule_GetKeyspaceNotificationFlagsAll instead. 
*/ +#ifdef ENABLE_GCRA +#define _REDISMODULE_NOTIFY_NEXT (1<<25) +#else #define _REDISMODULE_NOTIFY_NEXT (1<<24) +#endif /* Delivery flags for RM_SubscribeToKeyspaceEventsWithSubkeys. * These are passed in the 'flags' parameter, not in 'types'. */ diff --git a/src/server.h b/src/server.h index 13d05ce0e..2a6fa5fcb 100644 --- a/src/server.h +++ b/src/server.h @@ -288,8 +288,10 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define ACL_CATEGORY_CONNECTION (1ULL<<18) #define ACL_CATEGORY_TRANSACTION (1ULL<<19) #define ACL_CATEGORY_SCRIPTING (1ULL<<20) -#define ACL_CATEGORY_RATE_LIMIT (1ULL<<21) -#define ACL_CATEGORY_ARRAY (1ULL<<22) +#define ACL_CATEGORY_ARRAY (1ULL<<21) +#ifdef ENABLE_GCRA +#define ACL_CATEGORY_RATE_LIMIT (1ULL<<22) +#endif /* Key-spec flags * * -------------- */ @@ -798,12 +800,14 @@ typedef enum { #define NOTIFY_OVERWRITTEN (1<<15) /* o, key overwrite notification (Note: excluded from NOTIFY_ALL) */ #define NOTIFY_TYPE_CHANGED (1<<16) /* c, key type changed notification (Note: excluded from NOTIFY_ALL) */ #define NOTIFY_KEY_TRIMMED (1<<17) /* module only key space notification, indicates a key trimmed during slot migration */ -#define NOTIFY_RATE_LIMIT (1<<18) /* r, notify rate limit event (Note: excluded from NOTIFY_ALL)*/ #define NOTIFY_SUBKEYSPACE (1<<19) /* S, subkey-level keyspace notification */ #define NOTIFY_SUBKEYEVENT (1<<20) /* T, subkey-level keyevent notification */ #define NOTIFY_SUBKEYSPACEITEM (1<<21) /* I, subkey-level notification per item: channel=key\nsubkey */ #define NOTIFY_SUBKEYSPACEEVENT (1<<22) /* V, subkey-level notification: channel=event|key */ #define NOTIFY_ARRAY (1<<23) /* a, array notification */ +#ifdef ENABLE_GCRA +#define NOTIFY_RATE_LIMIT (1<<24) /* r, notify rate limit event (Note: excluded from NOTIFY_ALL)*/ +#endif #define NOTIFY_ALL (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_EXPIRED | NOTIFY_EVICTED | NOTIFY_STREAM | NOTIFY_MODULE | 
NOTIFY_ARRAY) /* A flag */ /* Using the following macro you can run code inside serverCron() with the @@ -866,11 +870,18 @@ typedef enum { * by a 64 bit module type ID, which has a 54 bits module-specific signature * in order to dispatch the loading to the right module, plus a 10 bits * encoding version. */ +/* Code related to GCRA is disabled by default. + * Build with -DENABLE_GCRA to compile it back in. */ + #define OBJ_MODULE 5 /* Module object. */ #define OBJ_STREAM 6 /* Stream object. */ -#define OBJ_GCRA 7 /* GCRA object. */ -#define OBJ_ARRAY 8 /* Array object. */ +#define OBJ_ARRAY 7 /* Array object. */ +#ifdef ENABLE_GCRA +#define OBJ_GCRA 8 /* GCRA object. */ #define OBJ_TYPE_MAX 9 /* Maximum number of object types */ +#else +#define OBJ_TYPE_MAX 8 /* Maximum number of object types */ +#endif /* NOTE: adding a new object requires changes in the following places: * - rdb.c - save/load (also bump RDB_VERSION if needed) @@ -2811,7 +2822,9 @@ typedef enum { COMMAND_GROUP_BITMAP, COMMAND_GROUP_ARRAY, COMMAND_GROUP_MODULE, +#ifdef ENABLE_GCRA COMMAND_GROUP_RATE_LIMIT, +#endif } redisCommandGroup; typedef void redisCommandProc(client *c); diff --git a/tests/assets/array-32bit.rdb b/tests/assets/array-32bit.rdb index 2e997c968e9d4ea2e8239f02e097f13329b0a4c5..94ff98ea388f91e17a4a5557936c77c3c9562c88 100644 GIT binary patch delta 97 zcmZ3%wt|f_$koL&*ucQhWFn`o>Yrn*S-&_+i&Il{b5nBQsgunZGzni&Il{b5nBj)8-wiR!%Q)ZvJ>xiGRjU?VVVadKQkHt1)>{)%yh*kZDorA@ diff --git a/tests/integration/corrupt-dump-fuzzer.tcl b/tests/integration/corrupt-dump-fuzzer.tcl index f2fa8bc0f..8bd170027 100644 --- a/tests/integration/corrupt-dump-fuzzer.tcl +++ b/tests/integration/corrupt-dump-fuzzer.tcl @@ -68,7 +68,9 @@ proc generate_types {} { # create other non-collection types r incr int r set string str +if 0 { r gcra gcra 10 5 60000 +} # create bigger objects with 10 items (more than a single ziplist / listpack) generate_collections big 10 diff --git a/tests/support/util.tcl 
b/tests/support/util.tcl index f74bc7e87..e46da150a 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -801,9 +801,12 @@ proc generate_fuzzy_traffic_on_key {key type duration} { set set_commands {SADD SCARD SDIFF SDIFFSTORE SINTER SINTERSTORE SISMEMBER SMEMBERS SMOVE SPOP SRANDMEMBER SREM SSCAN SUNION SUNIONSTORE} set stream_commands {XACK XADD XCLAIM XDEL XGROUP XINFO XLEN XPENDING XRANGE XREAD XREADGROUP XREVRANGE XTRIM XDELEX XACKDEL XNACK} set vset_commands {VADD VREM} - set gcra_commands {GCRA} set array_commands {ARSET ARGET ARDEL ARCOUNT ARMSET ARMGET ARGETRANGE ARDELRANGE ARINFO} - set commands [dict create string $string_commands hash $hash_commands zset $zset_commands list $list_commands set $set_commands stream $stream_commands vectorset $vset_commands gcra $gcra_commands array $array_commands] + set commands [dict create string $string_commands hash $hash_commands zset $zset_commands list $list_commands set $set_commands stream $stream_commands vectorset $vset_commands array $array_commands] +if 0 { + set gcra_commands {GCRA} + dict set commands gcra $gcra_commands +} set cmds [dict get $commands $type] set start_time [clock seconds] diff --git a/tests/unit/gcra.tcl b/tests/unit/gcra.tcl index 1080e76f7..1721334cb 100644 --- a/tests/unit/gcra.tcl +++ b/tests/unit/gcra.tcl @@ -1,4 +1,5 @@ start_server {tags {"gcra" "external:skip"}} { +if 0 { test {GCRA - argument validation} { # Wrong number of arguments (too few) catch {r gcra} err @@ -236,8 +237,10 @@ start_server {tags {"gcra" "external:skip"}} { assert {[r pttl mykey] > 0} } } +} start_server {tags {"gcra" "external:skip"}} { +if 0 { test {GCRA - RDB save and reload preserves value} { r del mykey r gcra mykey 5 1 60 @@ -333,8 +336,10 @@ start_server {tags {"gcra" "external:skip"}} { assert_equal $digest_before $digest_after } {} {needs:debug} } +} start_server {tags {"gcra repl" "external:skip"}} { +if 0 { set replica [srv 0 client] set replica_host [srv 0 host] set replica_port 
[srv 0 port] @@ -368,3 +373,4 @@ start_server {tags {"gcra repl" "external:skip"}} { } {} {external:skip} } } +} diff --git a/utils/generate-command-code.py b/utils/generate-command-code.py index b2137f1ae..fcd676df5 100755 --- a/utils/generate-command-code.py +++ b/utils/generate-command-code.py @@ -606,7 +606,9 @@ const char *COMMAND_GROUP_STR[] = { "bitmap", "array", "module", +#ifdef ENABLE_GCRA "rate_limit" +#endif }; const char *commandGroupStr(int index) { From 6c3a8ecceff085835a5388e97af0238646755bfe Mon Sep 17 00:00:00 2001 From: YaacovHazan <31382944+YaacovHazan@users.noreply.github.com> Date: Thu, 14 May 2026 17:14:15 +0300 Subject: [PATCH 18/19] Set default for INLINE_LSE_ATOMICS to 0 for compatibility across architectures (#15212) Ensure backward compatibility and consistent behavior across different architectures by explicitly setting the default value. Fixes #15175 Co-authored-by: ofiryanai --- modules/redisearch/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/redisearch/Makefile b/modules/redisearch/Makefile index c2cc409be..a56e9fc70 100644 --- a/modules/redisearch/Makefile +++ b/modules/redisearch/Makefile @@ -7,5 +7,10 @@ TARGET_MODULE = $(SRC_DIR)/bin/$(FULL_VARIANT)/search-community/redisearch.so LTO ?= 1 export LTO + # Set INLINE_LSE_ATOMICS=1 for perf improvement on common ARM CPUs (i.e. Graviton2/3/4); no effect on x86 or macOS. + # Default 0 keeps the binary runnable on pre-Armv8.1-a cores (Cortex-A72, Graviton1, RPi4) that would otherwise SIGILL at module load. 
+INLINE_LSE_ATOMICS ?= 0 +export INLINE_LSE_ATOMICS + include ../common.mk From 0c1c747062d5a31170d623fc14847c033f7479e5 Mon Sep 17 00:00:00 2001 From: stav-nachmias <99652558+stav-nachmias@users.noreply.github.com> Date: Thu, 14 May 2026 20:59:17 +0300 Subject: [PATCH 19/19] RED-196433 remove unused post release workflow (#15206) --- .github/workflows/post-release-automation.yml | 165 ------------------ 1 file changed, 165 deletions(-) delete mode 100644 .github/workflows/post-release-automation.yml diff --git a/.github/workflows/post-release-automation.yml b/.github/workflows/post-release-automation.yml deleted file mode 100644 index e9e756b4b..000000000 --- a/.github/workflows/post-release-automation.yml +++ /dev/null @@ -1,165 +0,0 @@ -name: Post-Release Automation - -on: - release: - types: [published] - -jobs: - extract-release-info: - if: github.repository == 'redis/redis' - runs-on: ubuntu-latest - outputs: - tag_name: ${{ steps.release-info.outputs.tag_name }} - release_type: ${{ steps.release-info.outputs.release_type }} - steps: - - name: Checkout repository - uses: actions/checkout@v6 - - - name: Extract and validate release information - id: release-info - env: - TAG_NAME: ${{ github.event.release.tag_name }} - GH_TOKEN: ${{ github.token }} - run: | - echo "tag_name=${TAG_NAME}" >> $GITHUB_OUTPUT - echo "Release tag: ${TAG_NAME}" - - LATEST_TAG=$(gh release view --json tagName --jq '.tagName') - echo "Latest release tag(from gh release): ${LATEST_TAG}" - - if [[ "${TAG_NAME}" == "${LATEST_TAG}" ]]; then - echo "release_type=latest" >> $GITHUB_OUTPUT - echo "Detected latest release: ${TAG_NAME}" - else - echo "release_type=non-latest" >> $GITHUB_OUTPUT - echo "Detected non-latest release: ${TAG_NAME} (latest is ${LATEST_TAG})" - fi - - create-tarball: - needs: extract-release-info - runs-on: ubuntu-latest - env: - TAG_NAME: ${{ needs.extract-release-info.outputs.tag_name }} - outputs: - sha256: ${{ steps.checksum.outputs.sha256 }} - size_mb: ${{ 
steps.size.outputs.size_mb }} - size_warning: ${{ steps.size.outputs.size_warning }} - steps: - - name: Checkout repository - uses: actions/checkout@v6 - with: - ref: ${{ env.TAG_NAME }} - fetch-depth: 0 - - - name: Create tarball - run: ./utils/releasetools/01_create_tarball.sh "$TAG_NAME" - - - name: Verify tarball size - id: size - run: | - TARBALL="/tmp/redis-${TAG_NAME}.tar.gz" - SIZE_MB=$(du -m "$TARBALL" | cut -f1) - echo "Tarball size: ${SIZE_MB} MB" - echo "size_mb=${SIZE_MB}" >> $GITHUB_OUTPUT - if [ "$SIZE_MB" -lt 3 ] || [ "$SIZE_MB" -gt 5 ]; then - echo "::warning::Tarball size ${SIZE_MB} MB is outside expected range (3-5 MB)" - echo "size_warning=true" >> $GITHUB_OUTPUT - else - echo "size_warning=false" >> $GITHUB_OUTPUT - fi - - - name: Calculate SHA256 checksum - id: checksum - run: | - TARBALL="/tmp/redis-${TAG_NAME}.tar.gz" - SHA256=$(shasum -a 256 "$TARBALL" | cut -d' ' -f1) - echo "SHA256: $SHA256" - echo "sha256=$SHA256" >> $GITHUB_OUTPUT - - - name: Upload tarball as artifact - uses: actions/upload-artifact@v6 - with: - name: redis-${{ env.TAG_NAME }}-tarball - path: /tmp/redis-${{ env.TAG_NAME }}.tar.gz - compression-level: 0 - - # approval-gate: - # needs: [extract-release-info, create-tarball] - # if: needs.extract-release-info.outputs.release_type == 'latest' - # runs-on: ubuntu-latest - # steps: - # - name: Approval gate - # run: | - # echo "Latest release detected. Manual approval required for production deployment." 
- # # TODO: Implement approval workflow - # # This could use GitHub Environments with required reviewers - # # or a manual approval step - - # upload-tarball: - # needs: [extract-release-info, create-tarball, approval-gate] - # if: always() && !cancelled() && needs.create-tarball.result == 'success' && (needs.approval-gate.result == 'success' || needs.approval-gate.result == 'skipped') - # runs-on: ubuntu-latest - # steps: - # - name: Upload tarball - # run: | - # echo "TODO: Implement tarball upload" - # # This will require: - # # - SSH credentials/keys for upload to download.redis.io - # # - Adaptation of utils/releasetools/02_upload_tarball.sh for CI environment - - # test-release-tarball: - # needs: upload-tarball - # runs-on: ubuntu-latest - # steps: - # - name: Test release tarball - # run: | - # echo "TODO: Implement release testing using utils/releasetools/03_test_release.sh" - # # This will: - # # - Download the uploaded tarball - # # - Extract and build Redis - - # update-release-hashes: - # needs: test-release-tarball - # runs-on: ubuntu-latest - # steps: - # - name: Update release hashes - # run: | - # echo "TODO: Implement hash update using utils/releasetools/04_release_hash.sh" - # # This will require: - # # - Access to redis-hashes repository - # # - Git credentials for committing and pushing - - summary-and-notify: - needs: [extract-release-info, create-tarball] # update-release-hashes - if: always() && github.repository == 'redis/redis' - runs-on: ubuntu-latest - env: - TAG_NAME: ${{ needs.extract-release-info.outputs.tag_name }} - RELEASE_TYPE: ${{ needs.extract-release-info.outputs.release_type }} - SHA256: ${{ needs.create-tarball.outputs.sha256 }} - SIZE_MB: ${{ needs.create-tarball.outputs.size_mb }} - SIZE_WARNING: ${{ needs.create-tarball.outputs.size_warning }} - steps: - - name: Summary - run: | - { - echo "## Post-Release Automation Summary" - echo "" - echo "- **Release Tag:** ${TAG_NAME}" - echo "- **Release Type:** ${RELEASE_TYPE}" - 
echo "- **Tarball SHA256:** ${SHA256}" - echo "- **Tarball Size:** ${SIZE_MB} MB" - if [ "${SIZE_WARNING}" == "true" ]; then - echo "" - echo "> [!WARNING]" - echo "> Tarball size is outside expected range, check the logs for details." - fi - } >> $GITHUB_STEP_SUMMARY - - # - name: Send Slack notification - # run: | - # echo "TODO: Implement Slack notification" - # # This will require: - # # - Slack webhook URL or bot token (stored in secrets) - # # - Determine appropriate channel (e.g., #releases, #redis-releases) - # # - Craft message with release information and workflow status