2012-05-11 08:37:16 -04:00
|
|
|
/*-
|
2023-05-10 11:40:58 -04:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2017-11-27 10:37:16 -05:00
|
|
|
*
|
2012-05-11 08:37:16 -04:00
|
|
|
* Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org>
|
2013-06-02 05:43:48 -04:00
|
|
|
* Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
|
2012-05-11 08:37:16 -04:00
|
|
|
* All rights reserved.
|
|
|
|
|
*
|
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
|
* are met:
|
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
|
*
|
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
|
#include <ctype.h>
|
|
|
|
|
#include <errno.h>
|
|
|
|
|
#include <err.h>
|
|
|
|
|
#include <langinfo.h>
|
|
|
|
|
#include <math.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <wchar.h>
|
|
|
|
|
#include <wctype.h>
|
|
|
|
|
|
|
|
|
|
#include "bwstring.h"
|
|
|
|
|
#include "sort.h"
|
|
|
|
|
|
2012-05-14 06:06:49 -04:00
|
|
|
bool byte_sort;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbreviation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not be able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
/*
 * Wide-character names of one calendar month.  Each member is filled in
 * by initialise_months() from a different langinfo item family.
 */
struct wmonth {
	wchar_t	*mon;	/* taken from the MON_* item */
	wchar_t	*ab;	/* taken from the ABMON_* item */
	wchar_t	*alt;	/* taken from the ALTMON_* item, when that exists */
};
|
2012-05-11 08:37:16 -04:00
|
|
|
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbreviation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not be able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
/*
 * Single-byte names of one calendar month.  Each member is filled in
 * by initialise_months() from a different langinfo item family.
 */
struct cmonth {
	char	*mon;	/* taken from the MON_* item */
	char	*ab;	/* taken from the ABMON_* item */
	char	*alt;	/* taken from the ALTMON_* item, when that exists */
};
|
|
|
|
|
|
|
|
|
|
/*
 * Month-name tables.  initialise_months() allocates one of them (twelve
 * entries) the first time it runs, depending on mb_cur_max; until then
 * both are NULL.
 */
static struct wmonth *wmonths = NULL;
static struct cmonth *cmonths = NULL;
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
populate_cmonth(char **field, const nl_item item, int idx)
|
|
|
|
|
{
|
|
|
|
|
char *tmp, *m;
|
|
|
|
|
size_t i, len;
|
|
|
|
|
|
|
|
|
|
tmp = nl_langinfo(item);
|
|
|
|
|
if (debug_sort)
|
|
|
|
|
printf("month[%d]=%s\n", idx, tmp);
|
|
|
|
|
if (*tmp == '\0')
|
|
|
|
|
return (0);
|
|
|
|
|
m = sort_strdup(tmp);
|
|
|
|
|
len = strlen(tmp);
|
|
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
|
m[i] = toupper(m[i]);
|
|
|
|
|
*field = m;
|
|
|
|
|
|
|
|
|
|
return (1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
populate_wmonth(wchar_t **field, const nl_item item, int idx)
|
|
|
|
|
{
|
|
|
|
|
wchar_t *m;
|
|
|
|
|
char *tmp;
|
|
|
|
|
size_t i, len;
|
|
|
|
|
|
|
|
|
|
tmp = nl_langinfo(item);
|
|
|
|
|
if (debug_sort)
|
|
|
|
|
printf("month[%d]=%s\n", idx, tmp);
|
|
|
|
|
if (*tmp == '\0')
|
|
|
|
|
return (0);
|
|
|
|
|
len = strlen(tmp);
|
|
|
|
|
m = sort_malloc(SIZEOF_WCHAR_STRING(len + 1));
|
|
|
|
|
if (mbstowcs(m, tmp, len) == ((size_t) - 1)) {
|
|
|
|
|
sort_free(m);
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
m[len] = L'\0';
|
|
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
|
m[i] = towupper(m[i]);
|
|
|
|
|
*field = m;
|
|
|
|
|
|
|
|
|
|
return (1);
|
|
|
|
|
}
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
void
|
|
|
|
|
initialise_months(void)
|
|
|
|
|
{
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
const nl_item mon_item[12] = { MON_1, MON_2, MON_3, MON_4,
|
|
|
|
|
MON_5, MON_6, MON_7, MON_8, MON_9, MON_10,
|
|
|
|
|
MON_11, MON_12 };
|
|
|
|
|
const nl_item ab_item[12] = { ABMON_1, ABMON_2, ABMON_3, ABMON_4,
|
2012-05-11 08:37:16 -04:00
|
|
|
ABMON_5, ABMON_6, ABMON_7, ABMON_8, ABMON_9, ABMON_10,
|
|
|
|
|
ABMON_11, ABMON_12 };
|
2023-12-07 15:27:07 -05:00
|
|
|
#ifdef ALTMON_1
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
const nl_item alt_item[12] = { ALTMON_1, ALTMON_2, ALTMON_3, ALTMON_4,
|
|
|
|
|
ALTMON_5, ALTMON_6, ALTMON_7, ALTMON_8, ALTMON_9, ALTMON_10,
|
|
|
|
|
ALTMON_11, ALTMON_12 };
|
2023-12-07 15:27:07 -05:00
|
|
|
#endif
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Handle all possible month formats: abbrevation, full name,
|
|
|
|
|
* standalone name (without case ending).
|
|
|
|
|
*/
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2012-05-11 08:37:16 -04:00
|
|
|
if (cmonths == NULL) {
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
cmonths = sort_malloc(sizeof(struct cmonth) * 12);
|
|
|
|
|
for (i = 0; i < 12; i++) {
|
|
|
|
|
if (!populate_cmonth(&cmonths[i].mon,
|
|
|
|
|
mon_item[i], i))
|
|
|
|
|
continue;
|
|
|
|
|
if (!populate_cmonth(&cmonths[i].ab,
|
|
|
|
|
ab_item[i], i))
|
|
|
|
|
continue;
|
2023-12-07 15:27:07 -05:00
|
|
|
#ifdef ALTMON_1
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
if (!populate_cmonth(&cmonths[i].alt,
|
|
|
|
|
alt_item[i], i))
|
2012-05-11 08:37:16 -04:00
|
|
|
continue;
|
2023-12-07 15:27:07 -05:00
|
|
|
#else
|
|
|
|
|
cmonths[i].alt = NULL;
|
|
|
|
|
#endif
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
if (wmonths == NULL) {
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
wmonths = sort_malloc(sizeof(struct wmonth) * 12);
|
|
|
|
|
for (i = 0; i < 12; i++) {
|
|
|
|
|
if (!populate_wmonth(&wmonths[i].mon,
|
|
|
|
|
mon_item[i], i))
|
2012-05-11 08:37:16 -04:00
|
|
|
continue;
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
if (!populate_wmonth(&wmonths[i].ab,
|
|
|
|
|
ab_item[i], i))
|
|
|
|
|
continue;
|
2023-12-07 15:27:07 -05:00
|
|
|
#ifdef ALTMON_1
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
if (!populate_wmonth(&wmonths[i].alt,
|
|
|
|
|
alt_item[i], i))
|
2012-05-11 08:37:16 -04:00
|
|
|
continue;
|
2023-12-07 15:27:07 -05:00
|
|
|
#else
|
|
|
|
|
wmonths[i].alt = NULL;
|
|
|
|
|
#endif
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Compare two wide-character strings.
 *
 * wcscoll() is tried first.  If it reports EILSEQ the result of wcscmp()
 * is used instead, and if errno is non-zero after that call as well the
 * strings are compared element by element.
 */
static int
wide_str_coll(const wchar_t *s1, const wchar_t *s2)
{
	const wchar_t *p, *q;
	int order;

	errno = 0;
	order = wcscoll(s1, s2);
	if (errno != EILSEQ)
		return (order);

	errno = 0;
	order = wcscmp(s1, s2);
	if (errno == 0)
		return (order);

	/* Last resort: walk both strings until they differ or one ends. */
	for (p = s1, q = s2; *p != L'\0' && *q != L'\0'; p++, q++) {
		if (*p != *q)
			return ((int)(*p - *q));
	}
	if (*p == L'\0')
		return ((*q == L'\0') ? 0 : -1);
	return (+1);
}
|
|
|
|
|
|
|
|
|
|
/* counterparts of wcs functions */
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
bwsprintf(FILE *f, struct bwstring *bws, const char *prefix, const char *suffix)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1)
|
2021-07-05 09:32:48 -04:00
|
|
|
fprintf(f, "%s%s%s", prefix, bws->cdata.str, suffix);
|
2012-05-11 08:37:16 -04:00
|
|
|
else
|
2021-07-05 09:32:48 -04:00
|
|
|
fprintf(f, "%s%S%s", prefix, bws->wdata.str, suffix);
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const void* bwsrawdata(const struct bwstring *bws)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-10-29 14:25:42 -04:00
|
|
|
return (bws->wdata.str);
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t bwsrawlen(const struct bwstring *bws)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
return ((mb_cur_max == 1) ? bws->cdata.len :
|
|
|
|
|
SIZEOF_WCHAR_STRING(bws->wdata.len));
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t
|
|
|
|
|
bws_memsize(const struct bwstring *bws)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
return ((mb_cur_max == 1) ?
|
|
|
|
|
(bws->cdata.len + 2 + sizeof(struct bwstring)) :
|
|
|
|
|
(SIZEOF_WCHAR_STRING(bws->wdata.len + 1) + sizeof(struct bwstring)));
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
bws_setlen(struct bwstring *bws, size_t newlen)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
if (mb_cur_max == 1 && bws && newlen != bws->cdata.len &&
|
|
|
|
|
newlen <= bws->cdata.len) {
|
|
|
|
|
bws->cdata.len = newlen;
|
|
|
|
|
bws->cdata.str[newlen] = '\0';
|
|
|
|
|
} else if (bws && newlen != bws->wdata.len && newlen <= bws->wdata.len) {
|
|
|
|
|
bws->wdata.len = newlen;
|
|
|
|
|
bws->wdata.str[newlen] = L'\0';
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Allocate a new binary string of specified size
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
bwsalloc(size_t sz)
|
|
|
|
|
{
|
|
|
|
|
struct bwstring *ret;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2012-05-11 08:37:16 -04:00
|
|
|
ret = sort_malloc(sizeof(struct bwstring) + 1 + sz);
|
2021-07-05 09:32:48 -04:00
|
|
|
ret->cdata.len = sz;
|
|
|
|
|
ret->cdata.str[sz] = '\0';
|
|
|
|
|
} else {
|
|
|
|
|
ret = sort_malloc(
|
|
|
|
|
sizeof(struct bwstring) + SIZEOF_WCHAR_STRING(sz + 1));
|
|
|
|
|
ret->wdata.len = sz;
|
|
|
|
|
ret->wdata.str[sz] = L'\0';
|
|
|
|
|
}
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
return (ret);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Create a copy of binary string.
|
|
|
|
|
* New string size equals the length of the old string.
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
bwsdup(const struct bwstring *s)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2012-05-11 08:37:16 -04:00
|
|
|
if (s == NULL)
|
|
|
|
|
return (NULL);
|
|
|
|
|
else {
|
2021-07-05 09:32:48 -04:00
|
|
|
struct bwstring *ret = bwsalloc(BWSLEN(s));
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1)
|
2021-07-05 09:32:48 -04:00
|
|
|
memcpy(ret->cdata.str, s->cdata.str, (s->cdata.len));
|
2012-05-11 08:37:16 -04:00
|
|
|
else
|
2021-07-05 09:32:48 -04:00
|
|
|
memcpy(ret->wdata.str, s->wdata.str,
|
|
|
|
|
SIZEOF_WCHAR_STRING(s->wdata.len));
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
return (ret);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2015-04-05 18:34:03 -04:00
|
|
|
* Create a new binary string from a wide character buffer.
|
2012-05-11 08:37:16 -04:00
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
bwssbdup(const wchar_t *str, size_t len)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2012-05-11 08:37:16 -04:00
|
|
|
if (str == NULL)
|
|
|
|
|
return ((len == 0) ? bwsalloc(0) : NULL);
|
|
|
|
|
else {
|
|
|
|
|
struct bwstring *ret;
|
|
|
|
|
|
|
|
|
|
ret = bwsalloc(len);
|
|
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1)
|
2012-05-11 08:37:16 -04:00
|
|
|
for (size_t i = 0; i < len; ++i)
|
2021-07-05 09:32:48 -04:00
|
|
|
ret->cdata.str[i] = (char)str[i];
|
2012-05-11 08:37:16 -04:00
|
|
|
else
|
2021-07-05 09:32:48 -04:00
|
|
|
memcpy(ret->wdata.str, str, SIZEOF_WCHAR_STRING(len));
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
return (ret);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Create a new binary string from a raw binary buffer.
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
bwscsbdup(const unsigned char *str, size_t len)
|
|
|
|
|
{
|
|
|
|
|
struct bwstring *ret;
|
|
|
|
|
|
|
|
|
|
ret = bwsalloc(len);
|
|
|
|
|
|
|
|
|
|
if (str) {
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1)
|
2021-07-05 09:32:48 -04:00
|
|
|
memcpy(ret->cdata.str, str, len);
|
2012-05-11 08:37:16 -04:00
|
|
|
else {
|
|
|
|
|
mbstate_t mbs;
|
|
|
|
|
const char *s;
|
|
|
|
|
size_t charlen, chars, cptr;
|
|
|
|
|
|
2017-02-17 14:53:20 -05:00
|
|
|
chars = 0;
|
2012-05-11 08:37:16 -04:00
|
|
|
cptr = 0;
|
|
|
|
|
s = (const char *) str;
|
|
|
|
|
|
|
|
|
|
memset(&mbs, 0, sizeof(mbs));
|
|
|
|
|
|
|
|
|
|
while (cptr < len) {
|
2021-05-13 08:55:06 -04:00
|
|
|
size_t n = mb_cur_max;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
if (n > len - cptr)
|
|
|
|
|
n = len - cptr;
|
|
|
|
|
charlen = mbrlen(s + cptr, n, &mbs);
|
|
|
|
|
switch (charlen) {
|
|
|
|
|
case 0:
|
|
|
|
|
/* FALLTHROUGH */
|
|
|
|
|
case (size_t) -1:
|
|
|
|
|
/* FALLTHROUGH */
|
|
|
|
|
case (size_t) -2:
|
2021-07-05 09:32:48 -04:00
|
|
|
ret->wdata.str[chars++] =
|
2012-05-11 08:37:16 -04:00
|
|
|
(unsigned char) s[cptr];
|
|
|
|
|
++cptr;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
2021-07-05 09:32:48 -04:00
|
|
|
n = mbrtowc(ret->wdata.str + (chars++),
|
2012-05-11 08:37:16 -04:00
|
|
|
s + cptr, charlen, &mbs);
|
|
|
|
|
if ((n == (size_t)-1) || (n == (size_t)-2))
|
|
|
|
|
/* NOTREACHED */
|
|
|
|
|
err(2, "mbrtowc error");
|
|
|
|
|
cptr += charlen;
|
2016-04-15 18:31:22 -04:00
|
|
|
}
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
ret->wdata.len = chars;
|
|
|
|
|
ret->wdata.str[ret->wdata.len] = L'\0';
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return (ret);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * De-allocate object memory.  A NULL pointer is ignored.
 */
void
bwsfree(const struct bwstring *s)
{

	if (s == NULL)
		return;
	sort_free(s);
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Copy content of src binary string to dst,
|
|
|
|
|
* with specified number of symbols to be copied.
|
|
|
|
|
* An offset value can be specified, from the start of src string.
|
|
|
|
|
* If the capacity of the dst string is not sufficient,
|
|
|
|
|
* then the data is truncated.
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
bwsnocpy(struct bwstring *dst, const struct bwstring *src, size_t offset,
|
|
|
|
|
size_t size)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
if (offset >= BWSLEN(src)) {
|
|
|
|
|
bws_setlen(dst, 0);
|
2012-05-11 08:37:16 -04:00
|
|
|
} else {
|
2021-07-05 09:32:48 -04:00
|
|
|
size_t nums = BWSLEN(src) - offset;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
if (nums > BWSLEN(dst))
|
|
|
|
|
nums = BWSLEN(dst);
|
2012-05-11 08:37:16 -04:00
|
|
|
if (nums > size)
|
|
|
|
|
nums = size;
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
memcpy(dst->cdata.str, src->cdata.str + offset, nums);
|
|
|
|
|
dst->cdata.len = nums;
|
|
|
|
|
dst->cdata.str[nums] = '\0';
|
2012-05-11 08:37:16 -04:00
|
|
|
} else {
|
2021-07-05 09:32:48 -04:00
|
|
|
memcpy(dst->wdata.str, src->wdata.str + offset,
|
2012-05-11 08:37:16 -04:00
|
|
|
SIZEOF_WCHAR_STRING(nums));
|
2021-07-05 09:32:48 -04:00
|
|
|
dst->wdata.len = nums;
|
|
|
|
|
dst->wdata.str[nums] = L'\0';
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return (dst);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Write binary string to the file.
|
|
|
|
|
* The output is ended either with '\n' (nl == true)
|
|
|
|
|
* or '\0' (nl == false).
|
|
|
|
|
*/
|
2012-11-01 07:38:34 -04:00
|
|
|
size_t
|
2012-05-11 08:37:16 -04:00
|
|
|
bwsfwrite(struct bwstring *bws, FILE *f, bool zero_ended)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
size_t len = bws->cdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
if (!zero_ended) {
|
2021-07-05 09:32:48 -04:00
|
|
|
bws->cdata.str[len] = '\n';
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
if (fwrite(bws->cdata.str, len + 1, 1, f) < 1)
|
2012-05-11 08:37:16 -04:00
|
|
|
err(2, NULL);
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
bws->cdata.str[len] = '\0';
|
|
|
|
|
} else if (fwrite(bws->cdata.str, len + 1, 1, f) < 1)
|
2012-05-11 08:37:16 -04:00
|
|
|
err(2, NULL);
|
|
|
|
|
|
|
|
|
|
return (len + 1);
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
wchar_t eols;
|
2012-11-01 07:38:34 -04:00
|
|
|
size_t printed = 0;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
eols = zero_ended ? btowc('\0') : btowc('\n');
|
|
|
|
|
|
2012-11-01 07:38:34 -04:00
|
|
|
while (printed < BWSLEN(bws)) {
|
2021-07-05 09:32:48 -04:00
|
|
|
const wchar_t *s = bws->wdata.str + printed;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
if (*s == L'\0') {
|
|
|
|
|
int nums;
|
|
|
|
|
|
|
|
|
|
nums = fwprintf(f, L"%lc", *s);
|
|
|
|
|
|
|
|
|
|
if (nums != 1)
|
|
|
|
|
err(2, NULL);
|
|
|
|
|
++printed;
|
|
|
|
|
} else {
|
|
|
|
|
int nums;
|
|
|
|
|
|
|
|
|
|
nums = fwprintf(f, L"%ls", s);
|
|
|
|
|
|
|
|
|
|
if (nums < 1)
|
|
|
|
|
err(2, NULL);
|
|
|
|
|
printed += nums;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
fwprintf(f, L"%lc", eols);
|
|
|
|
|
return (printed + 1);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
bwsncmp(const struct bwstring *bws1, const struct bwstring *bws2,
|
|
|
|
|
size_t offset, size_t len)
|
|
|
|
|
{
|
|
|
|
|
size_t cmp_len, len1, len2;
|
2021-07-05 09:32:48 -04:00
|
|
|
int res;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
len1 = BWSLEN(bws1);
|
|
|
|
|
len2 = BWSLEN(bws2);
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
if (len1 <= offset) {
|
|
|
|
|
return ((len2 <= offset) ? 0 : -1);
|
|
|
|
|
} else {
|
|
|
|
|
if (len2 <= offset)
|
|
|
|
|
return (+1);
|
|
|
|
|
else {
|
|
|
|
|
len1 -= offset;
|
|
|
|
|
len2 -= offset;
|
|
|
|
|
|
|
|
|
|
cmp_len = len1;
|
|
|
|
|
|
|
|
|
|
if (len2 < cmp_len)
|
|
|
|
|
cmp_len = len2;
|
|
|
|
|
|
|
|
|
|
if (len < cmp_len)
|
|
|
|
|
cmp_len = len;
|
|
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
const char *s1, *s2;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s1 = bws1->cdata.str + offset;
|
|
|
|
|
s2 = bws2->cdata.str + offset;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
res = memcmp(s1, s2, cmp_len);
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
const wchar_t *s1, *s2;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s1 = bws1->wdata.str + offset;
|
|
|
|
|
s2 = bws2->wdata.str + offset;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
res = memcmp(s1, s2, SIZEOF_WCHAR_STRING(cmp_len));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (res == 0) {
|
|
|
|
|
if (len1 < cmp_len && len1 < len2)
|
|
|
|
|
res = -1;
|
|
|
|
|
else if (len2 < cmp_len && len2 < len1)
|
|
|
|
|
res = +1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return (res);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Compare two strings from position offset to their ends.  The common
 * part is compared by bwsncmp(); when that part is equal, the string
 * with fewer remaining elements sorts first.
 *
 * Returns <0, 0 or >0.
 */
int
bwscmp(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset)
{
	/*
	 * Unsigned arithmetic: these wrap around if offset exceeds a length.
	 * bwsncmp() re-checks the real lengths against offset itself.
	 */
	const size_t rest1 = BWSLEN(bws1) - offset;
	const size_t rest2 = BWSLEN(bws2) - offset;
	const size_t shorter = (rest2 < rest1) ? rest2 : rest1;
	int diff;

	diff = bwsncmp(bws1, bws2, offset, shorter);
	if (diff != 0)
		return (diff);

	/* Equal over the common part: decide by remaining length. */
	if (rest1 < rest2)
		return (-1);
	if (rest2 < rest1)
		return (+1);
	return (0);
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
bws_iterator_cmp(bwstring_iterator iter1, bwstring_iterator iter2, size_t len)
|
|
|
|
|
{
|
|
|
|
|
wchar_t c1, c2;
|
2021-07-05 09:32:48 -04:00
|
|
|
size_t i;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
for (i = 0; i < len; ++i) {
|
|
|
|
|
c1 = bws_get_iter_value(iter1);
|
|
|
|
|
c2 = bws_get_iter_value(iter2);
|
|
|
|
|
if (c1 != c2)
|
|
|
|
|
return (c1 - c2);
|
|
|
|
|
iter1 = bws_iterator_inc(iter1, 1);
|
|
|
|
|
iter2 = bws_iterator_inc(iter2, 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
bwscoll(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset)
|
|
|
|
|
{
|
|
|
|
|
size_t len1, len2;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
len1 = BWSLEN(bws1);
|
|
|
|
|
len2 = BWSLEN(bws2);
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
if (len1 <= offset)
|
|
|
|
|
return ((len2 <= offset) ? 0 : -1);
|
|
|
|
|
else {
|
|
|
|
|
if (len2 <= offset)
|
|
|
|
|
return (+1);
|
|
|
|
|
else {
|
|
|
|
|
len1 -= offset;
|
|
|
|
|
len2 -= offset;
|
|
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
const char *s1, *s2;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s1 = bws1->cdata.str + offset;
|
|
|
|
|
s2 = bws2->cdata.str + offset;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
if (byte_sort) {
|
2021-07-05 09:32:48 -04:00
|
|
|
int res;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
if (len1 > len2) {
|
|
|
|
|
res = memcmp(s1, s2, len2);
|
|
|
|
|
if (!res)
|
|
|
|
|
res = +1;
|
|
|
|
|
} else if (len1 < len2) {
|
|
|
|
|
res = memcmp(s1, s2, len1);
|
|
|
|
|
if (!res)
|
|
|
|
|
res = -1;
|
|
|
|
|
} else
|
|
|
|
|
res = memcmp(s1, s2, len1);
|
|
|
|
|
|
|
|
|
|
return (res);
|
|
|
|
|
|
|
|
|
|
} else {
|
2021-07-05 09:32:48 -04:00
|
|
|
int res;
|
2012-05-11 08:37:16 -04:00
|
|
|
size_t i, maxlen;
|
|
|
|
|
|
|
|
|
|
i = 0;
|
|
|
|
|
maxlen = len1;
|
|
|
|
|
|
|
|
|
|
if (maxlen > len2)
|
|
|
|
|
maxlen = len2;
|
|
|
|
|
|
|
|
|
|
while (i < maxlen) {
|
|
|
|
|
/* goto next non-zero part: */
|
|
|
|
|
while ((i < maxlen) &&
|
|
|
|
|
!s1[i] && !s2[i])
|
|
|
|
|
++i;
|
|
|
|
|
|
|
|
|
|
if (i >= maxlen)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (s1[i] == 0) {
|
|
|
|
|
if (s2[i] == 0)
|
|
|
|
|
/* NOTREACHED */
|
|
|
|
|
err(2, "bwscoll error 01");
|
|
|
|
|
else
|
|
|
|
|
return (-1);
|
|
|
|
|
} else if (s2[i] == 0)
|
|
|
|
|
return (+1);
|
|
|
|
|
|
2012-11-01 07:38:34 -04:00
|
|
|
res = strcoll((const char*)(s1 + i), (const char*)(s2 + i));
|
2012-05-11 08:37:16 -04:00
|
|
|
if (res)
|
|
|
|
|
return (res);
|
|
|
|
|
|
|
|
|
|
while ((i < maxlen) &&
|
|
|
|
|
s1[i] && s2[i])
|
|
|
|
|
++i;
|
|
|
|
|
|
|
|
|
|
if (i >= maxlen)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (s1[i] == 0) {
|
|
|
|
|
if (s2[i] == 0) {
|
|
|
|
|
++i;
|
|
|
|
|
continue;
|
|
|
|
|
} else
|
|
|
|
|
return (-1);
|
|
|
|
|
} else if (s2[i] == 0)
|
|
|
|
|
return (+1);
|
|
|
|
|
else
|
|
|
|
|
/* NOTREACHED */
|
|
|
|
|
err(2, "bwscoll error 02");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (len1 < len2)
|
|
|
|
|
return (-1);
|
|
|
|
|
else if (len1 > len2)
|
|
|
|
|
return (+1);
|
|
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
const wchar_t *s1, *s2;
|
|
|
|
|
size_t i, maxlen;
|
2021-07-05 09:32:48 -04:00
|
|
|
int res;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s1 = bws1->wdata.str + offset;
|
|
|
|
|
s2 = bws2->wdata.str + offset;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
i = 0;
|
|
|
|
|
maxlen = len1;
|
|
|
|
|
|
|
|
|
|
if (maxlen > len2)
|
|
|
|
|
maxlen = len2;
|
|
|
|
|
|
|
|
|
|
while (i < maxlen) {
|
|
|
|
|
|
|
|
|
|
/* goto next non-zero part: */
|
|
|
|
|
while ((i < maxlen) &&
|
|
|
|
|
!s1[i] && !s2[i])
|
|
|
|
|
++i;
|
|
|
|
|
|
|
|
|
|
if (i >= maxlen)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (s1[i] == 0) {
|
|
|
|
|
if (s2[i] == 0)
|
|
|
|
|
/* NOTREACHED */
|
|
|
|
|
err(2, "bwscoll error 1");
|
|
|
|
|
else
|
|
|
|
|
return (-1);
|
|
|
|
|
} else if (s2[i] == 0)
|
|
|
|
|
return (+1);
|
|
|
|
|
|
|
|
|
|
res = wide_str_coll(s1 + i, s2 + i);
|
|
|
|
|
if (res)
|
|
|
|
|
return (res);
|
|
|
|
|
|
|
|
|
|
while ((i < maxlen) && s1[i] && s2[i])
|
|
|
|
|
++i;
|
|
|
|
|
|
|
|
|
|
if (i >= maxlen)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (s1[i] == 0) {
|
|
|
|
|
if (s2[i] == 0) {
|
|
|
|
|
++i;
|
|
|
|
|
continue;
|
|
|
|
|
} else
|
|
|
|
|
return (-1);
|
|
|
|
|
} else if (s2[i] == 0)
|
|
|
|
|
return (+1);
|
|
|
|
|
else
|
|
|
|
|
/* NOTREACHED */
|
|
|
|
|
err(2, "bwscoll error 2");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (len1 < len2)
|
|
|
|
|
return (-1);
|
|
|
|
|
else if (len1 > len2)
|
|
|
|
|
return (+1);
|
|
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Correction of the system API
|
|
|
|
|
*/
|
|
|
|
|
double
|
|
|
|
|
bwstod(struct bwstring *s0, bool *empty)
|
|
|
|
|
{
|
2021-07-05 09:32:48 -04:00
|
|
|
double ret;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
char *end, *s;
|
2012-05-11 08:37:16 -04:00
|
|
|
char *ep;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s = s0->cdata.str;
|
|
|
|
|
end = s + s0->cdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
ep = NULL;
|
|
|
|
|
|
|
|
|
|
while (isblank(*s) && s < end)
|
|
|
|
|
++s;
|
|
|
|
|
|
|
|
|
|
if (!isprint(*s)) {
|
|
|
|
|
*empty = true;
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
|
2012-11-01 07:38:34 -04:00
|
|
|
ret = strtod((char*)s, &ep);
|
2021-07-05 09:32:48 -04:00
|
|
|
if (ep == s) {
|
2012-05-11 08:37:16 -04:00
|
|
|
*empty = true;
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
wchar_t *end, *ep, *s;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s = s0->wdata.str;
|
|
|
|
|
end = s + s0->wdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
ep = NULL;
|
|
|
|
|
|
|
|
|
|
while (iswblank(*s) && s < end)
|
|
|
|
|
++s;
|
|
|
|
|
|
|
|
|
|
if (!iswprint(*s)) {
|
|
|
|
|
*empty = true;
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = wcstod(s, &ep);
|
|
|
|
|
if (ep == s) {
|
|
|
|
|
*empty = true;
|
|
|
|
|
return (0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*empty = false;
|
|
|
|
|
return (ret);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* A helper function for monthcoll. If a line matches
|
|
|
|
|
* a month name, it returns (number of the month - 1),
|
|
|
|
|
* while if there is no match, it just return -1.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
bws_month_score(const struct bwstring *s0)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
const char *end, *s;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s = s0->cdata.str;
|
|
|
|
|
end = s + s0->cdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (isblank(*s) && s < end)
|
|
|
|
|
++s;
|
|
|
|
|
|
|
|
|
|
for (int i = 11; i >= 0; --i) {
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
if (cmonths[i].mon && (s == strstr(s, cmonths[i].mon)))
|
|
|
|
|
return (i);
|
|
|
|
|
if (cmonths[i].ab && (s == strstr(s, cmonths[i].ab)))
|
|
|
|
|
return (i);
|
|
|
|
|
if (cmonths[i].alt && (s == strstr(s, cmonths[i].alt)))
|
2012-05-11 08:37:16 -04:00
|
|
|
return (i);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
const wchar_t *end, *s;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s = s0->wdata.str;
|
|
|
|
|
end = s + s0->wdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (iswblank(*s) && s < end)
|
|
|
|
|
++s;
|
|
|
|
|
|
|
|
|
|
for (int i = 11; i >= 0; --i) {
|
sort: test against all month formats in month-sort
The CLDR specification [1] defines three possible month formats:
- Abbreviation (e.g Jan, Ιαν)
- Full (e.g January, Ιανουαρίου)
- Standalone (e.g January, Ιανουάριος)
Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbrevation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.
For example, in Greek, "May" can take the following forms:
Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)
If we use the standalone format in Greek, sort(1) will not able to match
"Μαΐ" to "Μάιος" and the sort will fail.
This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.
[1] https://cldr.unicode.org/translation/date-time/date-time-patterns
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D42847
(cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
2023-11-30 19:30:10 -05:00
|
|
|
if (wmonths[i].ab && (s == wcsstr(s, wmonths[i].ab)))
|
|
|
|
|
return (i);
|
|
|
|
|
if (wmonths[i].mon && (s == wcsstr(s, wmonths[i].mon)))
|
|
|
|
|
return (i);
|
|
|
|
|
if (wmonths[i].alt && (s == wcsstr(s, wmonths[i].alt)))
|
2012-05-11 08:37:16 -04:00
|
|
|
return (i);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return (-1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Rips out leading blanks (-b).
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
ignore_leading_blanks(struct bwstring *str)
|
|
|
|
|
{
|
|
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
char *dst, *end, *src;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
src = str->cdata.str;
|
2012-05-11 08:37:16 -04:00
|
|
|
dst = src;
|
2021-07-05 09:32:48 -04:00
|
|
|
end = src + str->cdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (src < end && isblank(*src))
|
|
|
|
|
++src;
|
|
|
|
|
|
|
|
|
|
if (src != dst) {
|
|
|
|
|
size_t newlen;
|
|
|
|
|
|
|
|
|
|
newlen = BWSLEN(str) - (src - dst);
|
|
|
|
|
|
|
|
|
|
while (src < end) {
|
|
|
|
|
*dst = *src;
|
|
|
|
|
++dst;
|
|
|
|
|
++src;
|
|
|
|
|
}
|
|
|
|
|
bws_setlen(str, newlen);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
wchar_t *dst, *end, *src;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
src = str->wdata.str;
|
2012-05-11 08:37:16 -04:00
|
|
|
dst = src;
|
2021-07-05 09:32:48 -04:00
|
|
|
end = src + str->wdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (src < end && iswblank(*src))
|
|
|
|
|
++src;
|
|
|
|
|
|
|
|
|
|
if (src != dst) {
|
|
|
|
|
|
|
|
|
|
size_t newlen = BWSLEN(str) - (src - dst);
|
|
|
|
|
|
|
|
|
|
while (src < end) {
|
|
|
|
|
*dst = *src;
|
|
|
|
|
++dst;
|
|
|
|
|
++src;
|
|
|
|
|
}
|
|
|
|
|
bws_setlen(str, newlen);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return (str);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Rips out nonprinting characters (-i).
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
ignore_nonprinting(struct bwstring *str)
|
|
|
|
|
{
|
2021-07-05 09:32:48 -04:00
|
|
|
size_t newlen = BWSLEN(str);
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
char *dst, *end, *src;
|
|
|
|
|
char c;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
src = str->cdata.str;
|
2012-05-11 08:37:16 -04:00
|
|
|
dst = src;
|
2021-07-05 09:32:48 -04:00
|
|
|
end = src + str->cdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (src < end) {
|
|
|
|
|
c = *src;
|
|
|
|
|
if (isprint(c)) {
|
|
|
|
|
*dst = c;
|
|
|
|
|
++dst;
|
|
|
|
|
++src;
|
|
|
|
|
} else {
|
|
|
|
|
++src;
|
|
|
|
|
--newlen;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
wchar_t *dst, *end, *src;
|
|
|
|
|
wchar_t c;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
src = str->wdata.str;
|
2012-05-11 08:37:16 -04:00
|
|
|
dst = src;
|
2021-07-05 09:32:48 -04:00
|
|
|
end = src + str->wdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (src < end) {
|
|
|
|
|
c = *src;
|
|
|
|
|
if (iswprint(c)) {
|
|
|
|
|
*dst = c;
|
|
|
|
|
++dst;
|
|
|
|
|
++src;
|
|
|
|
|
} else {
|
|
|
|
|
++src;
|
|
|
|
|
--newlen;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
bws_setlen(str, newlen);
|
|
|
|
|
|
|
|
|
|
return (str);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Rips out any characters that are not alphanumeric characters
|
|
|
|
|
* nor blanks (-d).
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
dictionary_order(struct bwstring *str)
|
|
|
|
|
{
|
2021-07-05 09:32:48 -04:00
|
|
|
size_t newlen = BWSLEN(str);
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
char *dst, *end, *src;
|
|
|
|
|
char c;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
src = str->cdata.str;
|
2012-05-11 08:37:16 -04:00
|
|
|
dst = src;
|
2021-07-05 09:32:48 -04:00
|
|
|
end = src + str->cdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (src < end) {
|
|
|
|
|
c = *src;
|
|
|
|
|
if (isalnum(c) || isblank(c)) {
|
|
|
|
|
*dst = c;
|
|
|
|
|
++dst;
|
|
|
|
|
++src;
|
|
|
|
|
} else {
|
|
|
|
|
++src;
|
|
|
|
|
--newlen;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
wchar_t *dst, *end, *src;
|
|
|
|
|
wchar_t c;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
src = str->wdata.str;
|
2012-05-11 08:37:16 -04:00
|
|
|
dst = src;
|
2021-07-05 09:32:48 -04:00
|
|
|
end = src + str->wdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (src < end) {
|
|
|
|
|
c = *src;
|
|
|
|
|
if (iswalnum(c) || iswblank(c)) {
|
|
|
|
|
*dst = c;
|
|
|
|
|
++dst;
|
|
|
|
|
++src;
|
|
|
|
|
} else {
|
|
|
|
|
++src;
|
|
|
|
|
--newlen;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
bws_setlen(str, newlen);
|
|
|
|
|
|
|
|
|
|
return (str);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Converts string to lower case(-f).
|
|
|
|
|
*/
|
|
|
|
|
struct bwstring *
|
|
|
|
|
ignore_case(struct bwstring *str)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1) {
|
2021-07-05 09:32:48 -04:00
|
|
|
char *end, *s;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s = str->cdata.str;
|
|
|
|
|
end = s + str->cdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (s < end) {
|
|
|
|
|
*s = toupper(*s);
|
|
|
|
|
++s;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
wchar_t *end, *s;
|
|
|
|
|
|
2021-07-05 09:32:48 -04:00
|
|
|
s = str->wdata.str;
|
|
|
|
|
end = s + str->wdata.len;
|
2012-05-11 08:37:16 -04:00
|
|
|
|
|
|
|
|
while (s < end) {
|
|
|
|
|
*s = towupper(*s);
|
|
|
|
|
++s;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return (str);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
bws_disorder_warnx(struct bwstring *s, const char *fn, size_t pos)
|
|
|
|
|
{
|
2015-04-05 22:35:55 -04:00
|
|
|
|
2021-05-13 08:55:06 -04:00
|
|
|
if (mb_cur_max == 1)
|
2021-07-05 09:32:48 -04:00
|
|
|
warnx("%s:%zu: disorder: %s", fn, pos + 1, s->cdata.str);
|
2012-05-11 08:37:16 -04:00
|
|
|
else
|
2021-07-05 09:32:48 -04:00
|
|
|
warnx("%s:%zu: disorder: %ls", fn, pos + 1, s->wdata.str);
|
2012-05-11 08:37:16 -04:00
|
|
|
}
|