mirror of
https://github.com/postgres/postgres.git
synced 2026-06-09 00:32:10 -04:00
Apply encoding conversion in COPY TO FORMAT JSON
CopyToJsonOneRow() sent the output of composite_to_json() directly
via CopySendData() without encoding conversion. The text and CSV
paths convert per-attribute via pg_server_to_any() when
need_transcoding is true, but the JSON path skipped this entirely.
This meant COPY ... TO ... WITH (FORMAT json, ENCODING 'LATIN1') on
a UTF-8 server silently produced UTF-8 output, and COPY TO STDOUT
with a non-UTF-8 client_encoding would send unconverted bytes to
the client.
Apply pg_server_to_any() to the whole JSON buffer after
composite_to_json() returns, converting to the requested file
encoding when it differs from the server encoding. Tests cover
both the explicit ENCODING option and the implicit case where
file_encoding is inherited from client_encoding.
The bug was introduced by 7dadd38cda (json format for COPY TO).
Author: Ayush Tiwari <ayushtiwari.slg01@gmail.com>
Reviewed-by: Andrew Dunstan <andrew@dunslane.net>
Discussion: https://postgr.es/m/CAJTYsWX-jsLzxGRAb-dWnEpGYRPbDYHwce8LctVE92LiDfM2Jw@mail.gmail.com
This commit is contained in:
parent
45b02984e2
commit
7dc5bbcf22
3 changed files with 65 additions and 1 deletion
|
|
@ -427,7 +427,25 @@ CopyToJsonOneRow(CopyToState cstate, TupleTableSlot *slot)
|
|||
}
|
||||
}
|
||||
|
||||
CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
|
||||
/*
|
||||
* Convert the JSON output to the target encoding if needed. Unlike the
|
||||
* text and CSV paths, which convert per-attribute via CopyAttributeOut*,
|
||||
* composite_to_json() emits the whole row as one buffer, so we transcode
|
||||
* it here in a single call before sending.
|
||||
*/
|
||||
if (cstate->need_transcoding)
|
||||
{
|
||||
char *converted;
|
||||
|
||||
converted = pg_server_to_any(cstate->json_buf->data,
|
||||
cstate->json_buf->len,
|
||||
cstate->file_encoding);
|
||||
CopySendData(cstate, converted, strlen(converted));
|
||||
if (converted != cstate->json_buf->data)
|
||||
pfree(converted);
|
||||
}
|
||||
else
|
||||
CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
|
||||
|
||||
CopySendTextLikeEndOfRow(cstate);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,4 +50,31 @@ COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
|
|||
ERROR: invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
|
||||
CONTEXT: COPY copy_encoding_tab, line 1
|
||||
RESET client_encoding;
|
||||
-- JSON format encoding conversion
|
||||
\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
|
||||
COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
|
||||
-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
|
||||
-- Done as separate position checks to stay independent of the platform's
|
||||
-- end-of-line convention.
|
||||
SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
|
||||
position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
|
||||
has_latin1_e9 | has_utf8_e9
|
||||
---------------+-------------
|
||||
t | f
|
||||
(1 row)
|
||||
|
||||
-- Same with implicit encoding inherited from client_encoding (no ENCODING
|
||||
-- option). Covers the case where a client with a non-UTF8 client_encoding
|
||||
-- runs COPY ... FORMAT json and would otherwise receive unconverted bytes.
|
||||
\set json_implicit :abs_builddir '/results/copyencoding_json_implicit_latin1.json'
|
||||
SET client_encoding TO LATIN1;
|
||||
COPY (SELECT E'\u00e9' AS c) TO :'json_implicit' WITH (FORMAT json);
|
||||
RESET client_encoding;
|
||||
SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_latin1_e9,
|
||||
position('\xc3a9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_utf8_e9;
|
||||
has_latin1_e9 | has_utf8_e9
|
||||
---------------+-------------
|
||||
t | f
|
||||
(1 row)
|
||||
|
||||
DROP TABLE copy_encoding_tab;
|
||||
|
|
|
|||
|
|
@ -57,4 +57,23 @@ SET client_encoding TO EUC_JP;
|
|||
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
|
||||
RESET client_encoding;
|
||||
|
||||
-- JSON format encoding conversion
|
||||
\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
|
||||
COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
|
||||
-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
|
||||
-- Done as separate position checks to stay independent of the platform's
|
||||
-- end-of-line convention.
|
||||
SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
|
||||
position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
|
||||
|
||||
-- Same with implicit encoding inherited from client_encoding (no ENCODING
|
||||
-- option). Covers the case where a client with a non-UTF8 client_encoding
|
||||
-- runs COPY ... FORMAT json and would otherwise receive unconverted bytes.
|
||||
\set json_implicit :abs_builddir '/results/copyencoding_json_implicit_latin1.json'
|
||||
SET client_encoding TO LATIN1;
|
||||
COPY (SELECT E'\u00e9' AS c) TO :'json_implicit' WITH (FORMAT json);
|
||||
RESET client_encoding;
|
||||
SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_latin1_e9,
|
||||
position('\xc3a9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_utf8_e9;
|
||||
|
||||
DROP TABLE copy_encoding_tab;
|
||||
|
|
|
|||
Loading…
Reference in a new issue