mirror of
https://github.com/opnsense/src.git
synced 2026-05-28 04:12:45 -04:00
8-Bit character support.
Old locate(1) programs still work with the new database format; they print some garbage for 8-bit characters but do not dump core (except perhaps for char 30). 7-bit puritans should not notice any difference: same speed, and the same database size if the database contains only ASCII characters. Reviewed by: ache
This commit is contained in:
parent
8f3a9a1b78
commit
139764e8e9
6 changed files with 150 additions and 70 deletions
|
|
@ -1,4 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin.
|
||||
* Copyright (c) 1989, 1993
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
|
|
@ -33,7 +34,7 @@
|
|||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $Id: locate.bigram.c,v 1.1 1996/09/13 13:23:48 wosch Exp wosch $
|
||||
* $Id: locate.bigram.c,v 1.7 1996/09/14 20:15:49 wosch Exp $
|
||||
*/
|
||||
|
||||
#ifndef lint
|
||||
|
|
@ -60,7 +61,7 @@ static char sccsid[] = "@(#)locate.bigram.c 8.1 (Berkeley) 6/6/93";
|
|||
|
||||
u_char buf1[MAXPATHLEN] = " ";
|
||||
u_char buf2[MAXPATHLEN];
|
||||
u_int bigram[UCHAR_MAX][UCHAR_MAX];
|
||||
u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1];
|
||||
|
||||
int
|
||||
main(void)
|
||||
|
|
@ -84,7 +85,7 @@ main(void)
|
|||
break;
|
||||
|
||||
while (*cp != '\0' && *(cp + 1) != '\0') {
|
||||
bigram[(u_int)*cp][(u_int)*(cp + 1)]++;
|
||||
bigram[(u_char)*cp][(u_char)*(cp + 1)]++;
|
||||
cp += 2;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin.
|
||||
* Copyright (c) 1989, 1993
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
|
|
@ -33,7 +34,7 @@
|
|||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $Id: locate.code.c,v 1.4 1996/08/22 18:46:13 wosch Exp $
|
||||
* $Id: locate.code.c,v 1.5 1996/08/31 14:51:18 wosch Exp $
|
||||
*/
|
||||
|
||||
#ifndef lint
|
||||
|
|
@ -72,13 +73,22 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93";
|
|||
*
|
||||
* 0-28 likeliest differential counts + offset to make nonnegative
|
||||
* 30 switch code for out-of-range count to follow in next word
|
||||
* 31 an 8-bit char follows
|
||||
* 128-255 bigram codes (128 most common, as determined by 'updatedb')
|
||||
* 32-127 single character (printable) ascii residue (ie, literal)
|
||||
*
|
||||
* SEE ALSO: updatedb.csh, bigram.c
|
||||
* The locate database stores any character except newline ('\n')
|
||||
* and NUL ('\0'). The 8-bit character support doesn't waste extra
|
||||
* space until you have characters in file names less than 32
|
||||
* or greater than 127.
|
||||
*
|
||||
*
|
||||
* SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c
|
||||
*
|
||||
* AUTHOR: James A. Woods, Informatics General Corp.,
|
||||
* NASA Ames Research Center, 10/82
|
||||
* 8-bit file name characters:
|
||||
* Wolfram Schneider, Berlin September 1996
|
||||
*/
|
||||
|
||||
#include <sys/param.h>
|
||||
|
|
@ -93,14 +103,14 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93";
|
|||
|
||||
u_char buf1[MAXPATHLEN] = " ";
|
||||
u_char buf2[MAXPATHLEN];
|
||||
char bigrams[BGBUFSIZE + 1] = { 0 };
|
||||
u_char bigrams[BGBUFSIZE + 1] = { 0 };
|
||||
|
||||
#define LOOKUP 1 /* use a lookup array instead of a function, 3x faster */
|
||||
|
||||
#ifdef LOOKUP
|
||||
#define BGINDEX(x) (big[(u_int)*x][(u_int)*(x+1)])
|
||||
typedef u_char bg_t;
|
||||
bg_t big[UCHAR_MAX][UCHAR_MAX];
|
||||
#define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)])
|
||||
typedef short bg_t;
|
||||
bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1];
|
||||
#else
|
||||
#define BGINDEX(x) bgindex(x)
|
||||
typedef int bg_t;
|
||||
|
|
@ -145,12 +155,13 @@ main(argc, argv)
|
|||
|
||||
#ifdef LOOKUP
|
||||
/* init lookup table */
|
||||
for (i = 0; i < UCHAR_MAX; i++)
|
||||
for (j = 0; j < UCHAR_MAX; j++)
|
||||
for (i = 0; i < UCHAR_MAX + 1; i++)
|
||||
for (j = 0; j < UCHAR_MAX + 1; j++)
|
||||
big[i][j] = (bg_t)-1;
|
||||
|
||||
for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2)
|
||||
big[(int)*cp][(int)*(cp + 1)] = (bg_t)i;
|
||||
big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i;
|
||||
|
||||
#endif /* LOOKUP */
|
||||
|
||||
oldpath = buf1;
|
||||
|
|
@ -159,22 +170,21 @@ main(argc, argv)
|
|||
|
||||
while (fgets(path, sizeof(buf2), stdin) != NULL) {
|
||||
|
||||
/* skip empty lines */
|
||||
/* skip empty lines */
|
||||
if (*path == '\n')
|
||||
continue;
|
||||
|
||||
/* Squelch characters that would botch the decoding. */
|
||||
/* remove newline */
|
||||
for (cp = path; *cp != '\0'; cp++) {
|
||||
/* chop newline */
|
||||
if (*cp == '\n')
|
||||
*cp = '\0';
|
||||
/* range */
|
||||
else if (*cp < ASCII_MIN || *cp > ASCII_MAX)
|
||||
*cp = '?';
|
||||
}
|
||||
|
||||
/* Skip longest common prefix. */
|
||||
for (cp = path; *cp == *oldpath && *cp != '\0'; cp++, oldpath++);
|
||||
for (cp = path; *cp == *oldpath; cp++, oldpath++)
|
||||
if (*cp == '\0')
|
||||
break;
|
||||
|
||||
count = cp - path;
|
||||
diffcount = count - oldcount + OFFSET;
|
||||
|
|
@ -188,22 +198,42 @@ main(argc, argv)
|
|||
err(1, "stdout");
|
||||
|
||||
while (*cp != '\0') {
|
||||
if (*(cp + 1) == '\0') {
|
||||
if (putchar(*cp) == EOF)
|
||||
err(1, "stdout");
|
||||
break;
|
||||
}
|
||||
if ((code = BGINDEX(cp)) == (bg_t)-1) {
|
||||
if (putchar(*cp++) == EOF ||
|
||||
putchar(*cp++) == EOF)
|
||||
err(1, "stdout");
|
||||
} else {
|
||||
/* Found, so mark byte with parity bit. */
|
||||
/* print *two* characters */
|
||||
|
||||
if ((code = BGINDEX(cp)) != (bg_t)-1) {
|
||||
/*
|
||||
* print *one* as bigram
|
||||
* Found, so mark byte with
|
||||
* parity bit.
|
||||
*/
|
||||
if (putchar((code / 2) | PARITY) == EOF)
|
||||
err(1, "stdout");
|
||||
cp += 2;
|
||||
}
|
||||
|
||||
else {
|
||||
for (i = 0; i < 2; i++) {
|
||||
if (*cp == '\0')
|
||||
break;
|
||||
|
||||
/* print umlauts in file names */
|
||||
if (*cp < ASCII_MIN ||
|
||||
*cp > ASCII_MAX) {
|
||||
if (putchar(UMLAUT) == EOF ||
|
||||
putchar(*cp++) == EOF)
|
||||
err(1, "stdout");
|
||||
}
|
||||
|
||||
else {
|
||||
/* normal character */
|
||||
if(putchar(*cp++) == EOF)
|
||||
err(1, "stdout");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (path == buf1) { /* swap pointers */
|
||||
path = buf2;
|
||||
oldpath = buf1;
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@
|
|||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $Id: fastfind.c,v 1.1 1996/08/31 23:14:52 wosch Exp $
|
||||
* $Id: fastfind.c,v 1.2 1996/10/09 00:33:32 wosch Exp $
|
||||
*/
|
||||
|
||||
|
||||
|
|
@ -46,10 +46,10 @@ statistic (fp, path_fcodes)
|
|||
FILE *fp; /* open database */
|
||||
char *path_fcodes; /* for error message */
|
||||
{
|
||||
register int lines, chars, size, big;
|
||||
register int lines, chars, size, big, zwerg;
|
||||
register u_char *p, *s;
|
||||
register int c;
|
||||
int count;
|
||||
int count, umlaut;
|
||||
u_char bigram1[NBG], bigram2[NBG], path[MAXPATHLEN];
|
||||
|
||||
for (c = 0, p = bigram1, s = bigram2; c < NBG; c++) {
|
||||
|
|
@ -57,20 +57,27 @@ statistic (fp, path_fcodes)
|
|||
s[c] = check_bigram_char(getc(fp));
|
||||
}
|
||||
|
||||
lines = chars = big = 0;
|
||||
lines = chars = big = zwerg = umlaut = 0;
|
||||
size = NBG + NBG;
|
||||
|
||||
for (c = getc(fp), count = 0; c != EOF; size++) {
|
||||
if (c == SWITCH) {
|
||||
count += getwf(fp) - OFFSET;
|
||||
size += sizeof(int);
|
||||
zwerg++;
|
||||
} else
|
||||
count += c - OFFSET;
|
||||
|
||||
for (p = path + count; (c = getc(fp)) > SWITCH; size++)
|
||||
if (c < PARITY)
|
||||
if (c < PARITY) {
|
||||
if (c == UMLAUT) {
|
||||
c = getc(fp);
|
||||
size++;
|
||||
umlaut++;
|
||||
}
|
||||
p++;
|
||||
else {
|
||||
} else {
|
||||
/* bigram char */
|
||||
big++;
|
||||
p += 2;
|
||||
}
|
||||
|
|
@ -82,13 +89,16 @@ statistic (fp, path_fcodes)
|
|||
|
||||
(void)printf("\nDatabase: %s\n", path_fcodes);
|
||||
(void)printf("Compression: Front: %2.2f%%, ",
|
||||
(float)(100 * (size + big)) / chars);
|
||||
(float)(100 * (size + big - (2 * NBG))) / chars);
|
||||
(void)printf("Bigram: %2.2f%%, ", (float)(100 * (size - big)) / size);
|
||||
(void)printf("Total: %2.2f%%\n", (float)(100 * size) / chars);
|
||||
(void)printf("Total: %2.2f%%\n",
|
||||
(float)(100 * (size - (2 * NBG))) / chars);
|
||||
(void)printf("Filenames: %d, ", lines);
|
||||
(void)printf("Chars: %d\n", chars);
|
||||
(void)printf("Database size: %d, ", size);
|
||||
(void)printf("Bigram chars: %d\n", big);
|
||||
(void)printf("Characters: %d, ", chars);
|
||||
(void)printf("Database size: %d\n", size);
|
||||
(void)printf("Bigram characters: %d, ", big);
|
||||
(void)printf("Integers: %d, ", zwerg);
|
||||
(void)printf("8-Bit characters: %d\n", umlaut);
|
||||
|
||||
}
|
||||
#endif /* _LOCATE_STATISTIC_ */
|
||||
|
|
@ -102,7 +112,7 @@ void
|
|||
fastfind_mmap_icase
|
||||
#else
|
||||
fastfind_mmap
|
||||
#endif
|
||||
#endif /* FF_ICASE */
|
||||
(pathpart, paddr, len, database)
|
||||
char *pathpart; /* search string */
|
||||
caddr_t paddr; /* mmap pointer */
|
||||
|
|
@ -115,7 +125,7 @@ fastfind_mmap
|
|||
|
||||
#ifdef FF_ICASE
|
||||
fastfind_icase
|
||||
#else /* !FF_ICASE */
|
||||
#else
|
||||
fastfind
|
||||
#endif /* FF_ICASE */
|
||||
|
||||
|
|
@ -136,10 +146,10 @@ fastfind
|
|||
|
||||
#ifdef FF_ICASE
|
||||
/* use a lookup table for case insensitive search */
|
||||
u_char table[UCHAR_MAX];
|
||||
u_char table[UCHAR_MAX + 1];
|
||||
|
||||
tolower_word(pathpart);
|
||||
#endif
|
||||
#endif /* FF_ICASE*/
|
||||
|
||||
/* init bigram table */
|
||||
#ifdef FF_MMAP
|
||||
|
|
@ -157,7 +167,7 @@ fastfind
|
|||
p[c] = check_bigram_char(getc(fp));
|
||||
s[c] = check_bigram_char(getc(fp));
|
||||
}
|
||||
#endif
|
||||
#endif /* FF_MMAP */
|
||||
|
||||
/* find optimal (last) char for searching */
|
||||
for (p = pathpart; *p != '\0'; p++)
|
||||
|
|
@ -177,7 +187,7 @@ fastfind
|
|||
/* set patend char to true */
|
||||
table[TOLOWER(*patend)] = 1;
|
||||
table[toupper(*patend)] = 1;
|
||||
#endif
|
||||
#endif /* FF_ICASE */
|
||||
|
||||
|
||||
/* main loop */
|
||||
|
|
@ -185,10 +195,12 @@ fastfind
|
|||
foundchar = 0;
|
||||
|
||||
#ifdef FF_MMAP
|
||||
for (c = (u_char)*paddr++; len-- > 0; ) {
|
||||
c = (u_char)*paddr++; len--;
|
||||
for (; len > 0; ) {
|
||||
#else
|
||||
for (c = getc(fp); c != EOF; ) {
|
||||
#endif
|
||||
c = getc(fp);
|
||||
for (; c != EOF; ) {
|
||||
#endif /* FF_MMAP */
|
||||
|
||||
/* go forward or backward */
|
||||
if (c == SWITCH) { /* big step, an integer */
|
||||
|
|
@ -197,7 +209,7 @@ fastfind
|
|||
len -= INTSIZE; paddr += INTSIZE;
|
||||
#else
|
||||
count += getwf(fp) - OFFSET;
|
||||
#endif
|
||||
#endif /* FF_MMAP */
|
||||
} else { /* slow step, =< 14 chars */
|
||||
count += c - OFFSET;
|
||||
}
|
||||
|
|
@ -205,18 +217,40 @@ fastfind
|
|||
/* overlay old path */
|
||||
p = path + count;
|
||||
foundchar = p - 1;
|
||||
#ifdef FF_MMAP
|
||||
for (; (c = (u_char)*paddr++) > SWITCH; len--)
|
||||
#else
|
||||
for (; (c = getc(fp)) > SWITCH; )
|
||||
#endif
|
||||
|
||||
for (;;) {
|
||||
#ifdef FF_MMAP
|
||||
c = (u_char)*paddr++;
|
||||
len--;
|
||||
#else
|
||||
c = getc(fp);
|
||||
#endif /* FF_MMAP */
|
||||
/*
|
||||
* == UMLAUT: 8 bit char followed
|
||||
* <= SWITCH: offset
|
||||
* >= PARITY: bigram
|
||||
* rest: single ascii char
|
||||
*
|
||||
* offset < SWITCH < UMLAUT < ascii < PARITY < bigram
|
||||
*/
|
||||
if (c < PARITY) {
|
||||
if (c <= UMLAUT) {
|
||||
if (c == UMLAUT) {
|
||||
#ifdef FF_MMAP
|
||||
c = (u_char)*paddr++;
|
||||
len--;
|
||||
#else
|
||||
c = getc(fp);
|
||||
#endif /* FF_MMAP */
|
||||
|
||||
} else
|
||||
break; /* SWITCH */
|
||||
}
|
||||
#ifdef FF_ICASE
|
||||
if (table[c])
|
||||
#else
|
||||
if (c == cc)
|
||||
#endif
|
||||
#endif /* FF_ICASE */
|
||||
foundchar = p;
|
||||
*p++ = c;
|
||||
}
|
||||
|
|
@ -231,13 +265,13 @@ fastfind
|
|||
|
||||
if (table[bigram1[c]] ||
|
||||
table[bigram2[c]])
|
||||
#endif
|
||||
#endif /* FF_ICASE */
|
||||
foundchar = p + 1;
|
||||
|
||||
*p++ = bigram1[c];
|
||||
*p++ = bigram2[c];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (found) { /* previous line matched */
|
||||
cutoff = path;
|
||||
|
|
@ -254,14 +288,14 @@ fastfind
|
|||
if (*s == cc
|
||||
#ifdef FF_ICASE
|
||||
|| TOLOWER(*s) == cc
|
||||
#endif
|
||||
#endif /* FF_ICASE */
|
||||
) { /* fast first char check */
|
||||
for (p = patend - 1, q = s - 1; *p != '\0';
|
||||
p--, q--)
|
||||
if (*q != *p
|
||||
#ifdef FF_ICASE
|
||||
&& TOLOWER(*q) != *p
|
||||
#endif
|
||||
#endif /* FF_ICASE */
|
||||
)
|
||||
break;
|
||||
if (*p == '\0') { /* fast match success */
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@
|
|||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.\" @(#)locate.1 8.1 (Berkeley) 6/6/93
|
||||
.\" $Id$
|
||||
.\" $Id: locate.1,v 1.4 1996/08/31 23:14:52 wosch Exp $
|
||||
.\"
|
||||
.Dd June 6, 1993
|
||||
.Dt LOCATE 1
|
||||
|
|
@ -66,6 +66,12 @@ including slashes (``/'').
|
|||
As a special case, a pattern containing no globbing characters (``foo'')
|
||||
is matched as though it were ``*foo*''.
|
||||
|
||||
Historically, locate stored only characters between 32 and 127. The
|
||||
current implementation stores any character except newline ('\\n') and
|
||||
NUL ('\\0'). The 8-bit character support doesn't waste extra space for
|
||||
plain ASCII file names. Characters less than 32 or greater than 127
|
||||
are stored in 2 bytes.
|
||||
|
||||
The following options are available:
|
||||
.Bl -tag -width 10n indent
|
||||
.It Fl S
|
||||
|
|
@ -200,7 +206,8 @@ to share the databases between machines with different byte order.
|
|||
The current
|
||||
.Nm
|
||||
implementation understands databases in host byte order or
|
||||
network byte order. So you can read on a FreeBSD/i386 machine
|
||||
network byte order if both architectures use the same integer size.
|
||||
So you can read on a FreeBSD/i386 machine
|
||||
(little endian)
|
||||
a locate database which was built on a SunOS/sparc machine
|
||||
(big endian, net).
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@
|
|||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $Id: locate.c,v 1.4 1996/08/31 23:14:53 wosch Exp $
|
||||
* $Id: locate.c,v 1.5 1996/09/16 01:17:25 wosch Exp $
|
||||
*/
|
||||
|
||||
#ifndef lint
|
||||
|
|
@ -60,6 +60,7 @@ static char sccsid[] = "@(#)locate.c 8.1 (Berkeley) 6/6/93";
|
|||
*
|
||||
* 0-28 likeliest differential counts + offset to make nonnegative
|
||||
* 30 switch code for out-of-range count to follow in next word
|
||||
* 31 an 8-bit char follows
|
||||
* 128-255 bigram codes (128 most common, as determined by 'updatedb')
|
||||
* 32-127 single character (printable) ascii residue (ie, literal)
|
||||
*
|
||||
|
|
@ -76,19 +77,22 @@ static char sccsid[] = "@(#)locate.c 8.1 (Berkeley) 6/6/93";
|
|||
*/
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <fnmatch.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <err.h>
|
||||
#include <fnmatch.h>
|
||||
#include <locale.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef MMAP
|
||||
# include <sys/types.h>
|
||||
# include <sys/stat.h>
|
||||
# include <sys/mman.h>
|
||||
# include <fcntl.h>
|
||||
#endif
|
||||
#include <err.h>
|
||||
|
||||
|
||||
#ifdef sun
|
||||
#include <netinet/in.h> /* SunOS byteorder(3) htohl(3) */
|
||||
|
|
@ -148,6 +152,7 @@ main(argc, argv)
|
|||
#ifdef MMAP
|
||||
f_mmap = 1; /* mmap is default */
|
||||
#endif
|
||||
(void) setlocale(LC_ALL, "");
|
||||
|
||||
while ((ch = getopt(argc, argv, "Scd:il:ms")) != EOF)
|
||||
switch(ch) {
|
||||
|
|
@ -198,7 +203,7 @@ main(argc, argv)
|
|||
}
|
||||
|
||||
if (f_icase && UCHAR_MAX < 4096) /* init tolower lookup table */
|
||||
for (ch = 0; ch <= UCHAR_MAX; ch++)
|
||||
for (ch = 0; ch < UCHAR_MAX + 1; ch++)
|
||||
myctype[ch] = tolower(ch);
|
||||
|
||||
/* foreach database ... */
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin.
|
||||
* Copyright (c) 1989, 1993
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
|
|
@ -31,6 +32,7 @@
|
|||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)locate.h 8.1 (Berkeley) 6/6/93
|
||||
* $Id$
|
||||
*/
|
||||
|
||||
/* Symbolic constants shared by locate.c and code.c */
|
||||
|
|
@ -39,6 +41,7 @@
|
|||
#define OFFSET 14 /* abs value of max likely diff */
|
||||
#define PARITY 0200 /* parity bit */
|
||||
#define SWITCH 30 /* switch code */
|
||||
#define UMLAUT 31 /* escape code: an 8-bit char follows */
|
||||
|
||||
/* 0-28 likeliest differential counts + offset to make nonnegative */
|
||||
#define LDC_MIN 0
|
||||
|
|
|
|||
Loading…
Reference in a new issue