Skip to content

Commit

Permalink
Add header matching mode to COPY FROM
Browse files Browse the repository at this point in the history
COPY FROM supports the HEADER option to silently discard the header
line from a CSV or text file.  It is possible to load by mistake a
file that matches the expected format, for example, if two text
columns have been swapped, resulting in garbage in the database.

This adds a new option value HEADER MATCH that checks the column names
in the header line against the actual column names and errors out if
they do not match.

Author: Rémi Lapeyre <[email protected]>
Reviewed-by: Daniel Verite <[email protected]>
Reviewed-by: Peter Eisentraut <[email protected]>
Discussion: https://www.postgresql.org/message-id/flat/CAF1-J-0PtCWMeLtswwGV2M70U26n4g33gpe1rcKQqe6wVQDrFA@mail.gmail.com
  • Loading branch information
petere committed Mar 30, 2022
1 parent edcedcc commit 072132f
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 8 deletions.
19 changes: 18 additions & 1 deletion contrib/file_fdw/expected/file_fdw.out
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,21 @@ CREATE FOREIGN TABLE agg_bad (
) SERVER file_server
OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
-- test header matching
\set filename :abs_srcdir '/data/list1.csv'
CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_match;
1 | foo
---+-----
1 | bar
(1 row)

CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_doesnt_match; -- ERROR
ERROR: column name mismatch in header line field 1: got "1", expected "a"
CONTEXT: COPY header_doesnt_match, line 1: "1,foo"
-- per-column options tests
\set filename :abs_srcdir '/data/text.csv'
CREATE FOREIGN TABLE text_csv (
Expand Down Expand Up @@ -464,12 +479,14 @@ SET ROLE regress_file_fdw_superuser;
-- cleanup
RESET ROLE;
DROP EXTENSION file_fdw CASCADE;
NOTICE: drop cascades to 7 other objects
NOTICE: drop cascades to 9 other objects
DETAIL: drop cascades to server file_server
drop cascades to user mapping for regress_file_fdw_superuser on server file_server
drop cascades to user mapping for regress_no_priv_user on server file_server
drop cascades to foreign table agg_text
drop cascades to foreign table agg_csv
drop cascades to foreign table agg_bad
drop cascades to foreign table header_match
drop cascades to foreign table header_doesnt_match
drop cascades to foreign table text_csv
DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
9 changes: 9 additions & 0 deletions contrib/file_fdw/sql/file_fdw.sql
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ CREATE FOREIGN TABLE agg_bad (
OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
-- test header matching
\set filename :abs_srcdir '/data/list1.csv'
CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_match;
CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_doesnt_match; -- ERROR
-- per-column options tests
\set filename :abs_srcdir '/data/text.csv'
CREATE FOREIGN TABLE text_csv (
Expand Down
8 changes: 6 additions & 2 deletions doc/src/sgml/ref/copy.sgml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
FREEZE [ <replaceable class="parameter">boolean</replaceable> ]
DELIMITER '<replaceable class="parameter">delimiter_character</replaceable>'
NULL '<replaceable class="parameter">null_string</replaceable>'
HEADER [ <replaceable class="parameter">boolean</replaceable> ]
HEADER [ <replaceable class="parameter">boolean</replaceable> | <literal>match</literal> ]
QUOTE '<replaceable class="parameter">quote_character</replaceable>'
ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
FORCE_QUOTE { ( <replaceable class="parameter">column_name</replaceable> [, ...] ) | * }
Expand Down Expand Up @@ -276,7 +276,11 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
<para>
Specifies that the file contains a header line with the names of each
column in the file. On output, the first line contains the column
names from the table, and on input, the first line is ignored.
names from the table. On input, the first line is discarded when this
option is set to <literal>true</literal> (or equivalent Boolean value).
If this option is set to <literal>match</literal>, the number and names
of the columns in the header line must match the actual column names of
the table, otherwise an error is raised.
This option is not allowed when using <literal>binary</literal> format.
</para>
</listitem>
Expand Down
60 changes: 59 additions & 1 deletion src/backend/commands/copy.c
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,64 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt,
table_close(rel, NoLock);
}

/*
* Extract a CopyHeaderChoice value from a DefElem. This is like
* defGetBoolean() but also accepts the special value "match".
*/
static CopyHeaderChoice
defGetCopyHeaderChoice(DefElem *def)
{
/*
* If no parameter given, assume "true" is meant.
*/
if (def->arg == NULL)
return COPY_HEADER_TRUE;

/*
* Allow 0, 1, "true", "false", "on", "off", or "match".
*/
switch (nodeTag(def->arg))
{
case T_Integer:
switch (intVal(def->arg))
{
case 0:
return COPY_HEADER_FALSE;
case 1:
return COPY_HEADER_TRUE;
default:
/* otherwise, error out below */
break;
}
break;
default:
{
char *sval = defGetString(def);

/*
* The set of strings accepted here should match up with the
* grammar's opt_boolean_or_string production.
*/
if (pg_strcasecmp(sval, "true") == 0)
return COPY_HEADER_TRUE;
if (pg_strcasecmp(sval, "false") == 0)
return COPY_HEADER_FALSE;
if (pg_strcasecmp(sval, "on") == 0)
return COPY_HEADER_TRUE;
if (pg_strcasecmp(sval, "off") == 0)
return COPY_HEADER_FALSE;
if (pg_strcasecmp(sval, "match") == 0)
return COPY_HEADER_MATCH;
}
break;
}
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("%s requires a Boolean value or \"match\"",
def->defname)));
return COPY_HEADER_FALSE; /* keep compiler quiet */
}

/*
* Process the statement option list for COPY.
*
Expand Down Expand Up @@ -394,7 +452,7 @@ ProcessCopyOptions(ParseState *pstate,
if (header_specified)
errorConflictingDefElem(defel, pstate);
header_specified = true;
opts_out->header_line = defGetBoolean(defel);
opts_out->header_line = defGetCopyHeaderChoice(defel);
}
else if (strcmp(defel->defname, "quote") == 0)
{
Expand Down
53 changes: 50 additions & 3 deletions src/backend/commands/copyfromparse.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_bswap.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/rel.h"

Expand Down Expand Up @@ -758,12 +759,58 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
/* only available for text or csv input */
Assert(!cstate->opts.binary);

/* on input just throw the header line away */
/* on input check that the header line is correct if needed */
if (cstate->cur_lineno == 0 && cstate->opts.header_line)
{
ListCell *cur;
TupleDesc tupDesc;

tupDesc = RelationGetDescr(cstate->rel);

cstate->cur_lineno++;
if (CopyReadLine(cstate))
return false; /* done */
done = CopyReadLine(cstate);

if (cstate->opts.header_line == COPY_HEADER_MATCH)
{
int fldnum;

if (cstate->opts.csv_mode)
fldct = CopyReadAttributesCSV(cstate);
else
fldct = CopyReadAttributesText(cstate);

if (fldct != list_length(cstate->attnumlist))
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("wrong number of fields in header line: field count is %d, expected %d",
fldct, list_length(cstate->attnumlist))));

fldnum = 0;
foreach(cur, cstate->attnumlist)
{
int attnum = lfirst_int(cur);
char *colName = cstate->raw_fields[attnum - 1];
Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);

fldnum++;

if (colName == NULL)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
fldnum, cstate->opts.null_print, NameStr(attr->attname))));

if (namestrcmp(&attr->attname, colName) != 0) {
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
fldnum, colName, NameStr(attr->attname))));
}
}
}

if (done)
return false;
}

cstate->cur_lineno++;
Expand Down
13 changes: 12 additions & 1 deletion src/include/commands/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@
#include "parser/parse_node.h"
#include "tcop/dest.h"

/*
* Represents whether a header line should be present, and whether it must
* match the actual names (which implies "true").
*/
typedef enum CopyHeaderChoice
{
COPY_HEADER_FALSE = 0,
COPY_HEADER_TRUE,
COPY_HEADER_MATCH,
} CopyHeaderChoice;

/*
* A struct to hold COPY options, in a parsed form. All of these are related
* to formatting, except for 'freeze', which doesn't really belong here, but
Expand All @@ -32,7 +43,7 @@ typedef struct CopyFormatOptions
bool binary; /* binary format? */
bool freeze; /* freeze rows on loading? */
bool csv_mode; /* Comma Separated Value format? */
bool header_line; /* header line? */
CopyHeaderChoice header_line; /* header line? */
char *null_print; /* NULL marker string (server encoding!) */
int null_print_len; /* length of same */
char *null_print_client; /* same converted to file encoding */
Expand Down
23 changes: 23 additions & 0 deletions src/test/regress/expected/copy.out
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,26 @@ INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progre
drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
drop function notice_after_tab_progress_reporting();
drop table tab_progress_reporting;
-- Test header matching feature
create table header_copytest (
a int,
b int,
c text
);
copy header_copytest from stdin with (header wrong_choice);
ERROR: header requires a Boolean value or "match"
copy header_copytest from stdin with (header match);
copy header_copytest from stdin with (header match);
ERROR: column name mismatch in header line field 3: got null value ("\N"), expected "c"
CONTEXT: COPY header_copytest, line 1: "a b \N"
copy header_copytest from stdin with (header match);
ERROR: wrong number of fields in header line: field count is 2, expected 3
CONTEXT: COPY header_copytest, line 1: "a b"
copy header_copytest from stdin with (header match);
ERROR: wrong number of fields in header line: field count is 4, expected 3
CONTEXT: COPY header_copytest, line 1: "a b c d"
copy header_copytest from stdin with (header match);
ERROR: column name mismatch in header line field 3: got "d", expected "c"
CONTEXT: COPY header_copytest, line 1: "a b d"
copy header_copytest from stdin with (header match, format csv);
drop table header_copytest;
33 changes: 33 additions & 0 deletions src/test/regress/sql/copy.sql
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,36 @@ copy tab_progress_reporting from :'filename'
drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
drop function notice_after_tab_progress_reporting();
drop table tab_progress_reporting;

-- Test header matching feature
create table header_copytest (
a int,
b int,
c text
);
copy header_copytest from stdin with (header wrong_choice);
copy header_copytest from stdin with (header match);
a b c
1 2 foo
\.
copy header_copytest from stdin with (header match);
a b \N
1 2 foo
\.
copy header_copytest from stdin with (header match);
a b
1 2
\.
copy header_copytest from stdin with (header match);
a b c d
1 2 foo bar
\.
copy header_copytest from stdin with (header match);
a b d
1 2 foo
\.
copy header_copytest from stdin with (header match, format csv);
a,b,c
1,2,foo
\.
drop table header_copytest;

0 comments on commit 072132f

Please sign in to comment.