diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c
index fbcf7ca9c91ea..072a6dc1c1639 100644
--- a/contrib/file_fdw/file_fdw.c
+++ b/contrib/file_fdw/file_fdw.c
@@ -996,7 +996,7 @@ estimate_size(PlannerInfo *root, RelOptInfo *baserel,
/*
* Estimate the number of tuples in the file.
*/
- if (baserel->pages > 0)
+ if (baserel->tuples >= 0 && baserel->pages > 0)
{
/*
* We have # of pages and # of tuples from pg_class (that is, from a
diff --git a/contrib/hstore/Makefile b/contrib/hstore/Makefile
index 872ca03cd1fb0..72376d9007633 100644
--- a/contrib/hstore/Makefile
+++ b/contrib/hstore/Makefile
@@ -15,7 +15,7 @@ DATA = hstore--1.4.sql \
hstore--1.5--1.6.sql \
hstore--1.4--1.5.sql \
hstore--1.3--1.4.sql hstore--1.2--1.3.sql \
- hstore--1.1--1.2.sql hstore--1.0--1.1.sql
+ hstore--1.1--1.2.sql
PGFILEDESC = "hstore - key/value pair data type"
HEADERS = hstore.h
diff --git a/contrib/hstore/hstore--1.0--1.1.sql b/contrib/hstore/hstore--1.0--1.1.sql
deleted file mode 100644
index 4e32a575c5f68..0000000000000
--- a/contrib/hstore/hstore--1.0--1.1.sql
+++ /dev/null
@@ -1,7 +0,0 @@
-/* contrib/hstore/hstore--1.0--1.1.sql */
-
--- complain if script is sourced in psql, rather than via ALTER EXTENSION
-\echo Use "ALTER EXTENSION hstore UPDATE TO '1.1'" to load this file. \quit
-
-ALTER EXTENSION hstore DROP OPERATOR => (text, text);
-DROP OPERATOR => (text, text);
diff --git a/contrib/passwordcheck/passwordcheck.c b/contrib/passwordcheck/passwordcheck.c
index d5f9d14b01095..70f056232fe72 100644
--- a/contrib/passwordcheck/passwordcheck.c
+++ b/contrib/passwordcheck/passwordcheck.c
@@ -91,6 +91,9 @@ check_password(const char *username,
int i;
bool pwd_has_letter,
pwd_has_nonletter;
+#ifdef USE_CRACKLIB
+ const char *reason;
+#endif
/* enforce minimum length */
if (pwdlen < MIN_PWD_LENGTH)
@@ -125,10 +128,11 @@ check_password(const char *username,
#ifdef USE_CRACKLIB
/* call cracklib to check password */
- if (FascistCheck(password, CRACKLIB_DICTPATH))
+ if ((reason = FascistCheck(password, CRACKLIB_DICTPATH)))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("password is easily cracked")));
+ errmsg("password is easily cracked"),
+ errdetail_log("cracklib diagnostic: %s", reason)));
#endif
}
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index d797095458a47..c32ddc56fdbc4 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -46,6 +46,7 @@
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
+#include "utils/datetime.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/rel.h"
diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index 3a99333d44351..23306e11a78d6 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -195,6 +195,9 @@ statapprox_heap(Relation rel, output_type *stat)
stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
stat->tuple_count);
+ /* It's not clear if we could get -1 here, but be safe. */
+ stat->tuple_count = Max(stat->tuple_count, 0);
+
/*
* Calculate percentages if the relation has one or more pages.
*/
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 52d1fe356315e..08daf26fdf085 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -22,6 +22,7 @@
#include "postgres_fdw.h"
#include "storage/fd.h"
#include "storage/latch.h"
+#include "utils/datetime.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/memutils.h"
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 9fc53cad68038..a31abce7c9960 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -692,15 +692,14 @@ postgresGetForeignRelSize(PlannerInfo *root,
else
{
/*
- * If the foreign table has never been ANALYZEd, it will have relpages
- * and reltuples equal to zero, which most likely has nothing to do
- * with reality. We can't do a whole lot about that if we're not
+ * If the foreign table has never been ANALYZEd, it will have
+ * reltuples < 0, meaning "unknown". We can't do much if we're not
* allowed to consult the remote server, but we can use a hack similar
* to plancat.c's treatment of empty relations: use a minimum size
* estimate of 10 pages, and divide by the column-datatype-based width
* estimate to get the corresponding number of tuples.
*/
- if (baserel->pages == 0 && baserel->tuples == 0)
+ if (baserel->tuples < 0)
{
baserel->pages = 10;
baserel->tuples =
diff --git a/contrib/test_decoding/expected/ddl.out b/contrib/test_decoding/expected/ddl.out
index d79cd316b79fc..4ff0044c7879b 100644
--- a/contrib/test_decoding/expected/ddl.out
+++ b/contrib/test_decoding/expected/ddl.out
@@ -565,6 +565,35 @@ UPDATE table_with_unique_not_null SET data = 3 WHERE data = 2;
UPDATE table_with_unique_not_null SET id = -id;
UPDATE table_with_unique_not_null SET id = -id;
DELETE FROM table_with_unique_not_null WHERE data = 3;
+-- check tables with dropped indexes used in REPLICA IDENTITY
+-- table with primary key
+CREATE TABLE table_dropped_index_with_pk (a int PRIMARY KEY, b int, c int);
+CREATE UNIQUE INDEX table_dropped_index_with_pk_idx
+ ON table_dropped_index_with_pk(a);
+ALTER TABLE table_dropped_index_with_pk REPLICA IDENTITY
+ USING INDEX table_dropped_index_with_pk_idx;
+DROP INDEX table_dropped_index_with_pk_idx;
+INSERT INTO table_dropped_index_with_pk VALUES (1,1,1), (2,2,2), (3,3,3);
+UPDATE table_dropped_index_with_pk SET a = 4 WHERE a = 1;
+UPDATE table_dropped_index_with_pk SET b = 5 WHERE a = 2;
+UPDATE table_dropped_index_with_pk SET b = 6, c = 7 WHERE a = 3;
+DELETE FROM table_dropped_index_with_pk WHERE b = 1;
+DELETE FROM table_dropped_index_with_pk WHERE a = 3;
+DROP TABLE table_dropped_index_with_pk;
+-- table without primary key
+CREATE TABLE table_dropped_index_no_pk (a int NOT NULL, b int, c int);
+CREATE UNIQUE INDEX table_dropped_index_no_pk_idx
+ ON table_dropped_index_no_pk(a);
+ALTER TABLE table_dropped_index_no_pk REPLICA IDENTITY
+ USING INDEX table_dropped_index_no_pk_idx;
+DROP INDEX table_dropped_index_no_pk_idx;
+INSERT INTO table_dropped_index_no_pk VALUES (1,1,1), (2,2,2), (3,3,3);
+UPDATE table_dropped_index_no_pk SET a = 4 WHERE a = 1;
+UPDATE table_dropped_index_no_pk SET b = 5 WHERE a = 2;
+UPDATE table_dropped_index_no_pk SET b = 6, c = 7 WHERE a = 3;
+DELETE FROM table_dropped_index_no_pk WHERE b = 1;
+DELETE FROM table_dropped_index_no_pk WHERE a = 3;
+DROP TABLE table_dropped_index_no_pk;
-- check toast support
BEGIN;
CREATE SEQUENCE toasttable_rand_seq START 79 INCREMENT 1499; -- portable "random"
@@ -682,6 +711,46 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc
table public.table_with_unique_not_null: DELETE: id[integer]:4
COMMIT
BEGIN
+ table public.table_dropped_index_with_pk: INSERT: a[integer]:1 b[integer]:1 c[integer]:1
+ table public.table_dropped_index_with_pk: INSERT: a[integer]:2 b[integer]:2 c[integer]:2
+ table public.table_dropped_index_with_pk: INSERT: a[integer]:3 b[integer]:3 c[integer]:3
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_with_pk: UPDATE: a[integer]:4 b[integer]:1 c[integer]:1
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_with_pk: UPDATE: a[integer]:2 b[integer]:5 c[integer]:2
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_with_pk: UPDATE: a[integer]:3 b[integer]:6 c[integer]:7
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_with_pk: DELETE: (no-tuple-data)
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_with_pk: DELETE: (no-tuple-data)
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_no_pk: INSERT: a[integer]:1 b[integer]:1 c[integer]:1
+ table public.table_dropped_index_no_pk: INSERT: a[integer]:2 b[integer]:2 c[integer]:2
+ table public.table_dropped_index_no_pk: INSERT: a[integer]:3 b[integer]:3 c[integer]:3
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_no_pk: UPDATE: a[integer]:4 b[integer]:1 c[integer]:1
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_no_pk: UPDATE: a[integer]:2 b[integer]:5 c[integer]:2
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_no_pk: UPDATE: a[integer]:3 b[integer]:6 c[integer]:7
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_no_pk: DELETE: (no-tuple-data)
+ COMMIT
+ BEGIN
+ table public.table_dropped_index_no_pk: DELETE: (no-tuple-data)
+ COMMIT
+ BEGIN
table public.toasttable: INSERT: id[integer]:1 toasted_col1[text]:'12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000' rand1[double precision]:79 toasted_col2[text]:null rand2[double precision]:1578
COMMIT
BEGIN
@@ -690,7 +759,7 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc
BEGIN
table public.toasttable: UPDATE: id[integer]:1 toasted_col1[text]:'12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000' rand1[double precision]:79 toasted_col2[text]:null rand2[double precision]:1578
COMMIT
-(103 rows)
+(143 rows)
INSERT INTO toasttable(toasted_col1) SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i);
-- update of second column, first column unchanged
diff --git a/contrib/test_decoding/sql/ddl.sql b/contrib/test_decoding/sql/ddl.sql
index 2c4823e578057..1b3866d01530d 100644
--- a/contrib/test_decoding/sql/ddl.sql
+++ b/contrib/test_decoding/sql/ddl.sql
@@ -345,6 +345,37 @@ UPDATE table_with_unique_not_null SET id = -id;
UPDATE table_with_unique_not_null SET id = -id;
DELETE FROM table_with_unique_not_null WHERE data = 3;
+-- check tables with dropped indexes used in REPLICA IDENTITY
+-- table with primary key
+CREATE TABLE table_dropped_index_with_pk (a int PRIMARY KEY, b int, c int);
+CREATE UNIQUE INDEX table_dropped_index_with_pk_idx
+ ON table_dropped_index_with_pk(a);
+ALTER TABLE table_dropped_index_with_pk REPLICA IDENTITY
+ USING INDEX table_dropped_index_with_pk_idx;
+DROP INDEX table_dropped_index_with_pk_idx;
+INSERT INTO table_dropped_index_with_pk VALUES (1,1,1), (2,2,2), (3,3,3);
+UPDATE table_dropped_index_with_pk SET a = 4 WHERE a = 1;
+UPDATE table_dropped_index_with_pk SET b = 5 WHERE a = 2;
+UPDATE table_dropped_index_with_pk SET b = 6, c = 7 WHERE a = 3;
+DELETE FROM table_dropped_index_with_pk WHERE b = 1;
+DELETE FROM table_dropped_index_with_pk WHERE a = 3;
+DROP TABLE table_dropped_index_with_pk;
+
+-- table without primary key
+CREATE TABLE table_dropped_index_no_pk (a int NOT NULL, b int, c int);
+CREATE UNIQUE INDEX table_dropped_index_no_pk_idx
+ ON table_dropped_index_no_pk(a);
+ALTER TABLE table_dropped_index_no_pk REPLICA IDENTITY
+ USING INDEX table_dropped_index_no_pk_idx;
+DROP INDEX table_dropped_index_no_pk_idx;
+INSERT INTO table_dropped_index_no_pk VALUES (1,1,1), (2,2,2), (3,3,3);
+UPDATE table_dropped_index_no_pk SET a = 4 WHERE a = 1;
+UPDATE table_dropped_index_no_pk SET b = 5 WHERE a = 2;
+UPDATE table_dropped_index_no_pk SET b = 6, c = 7 WHERE a = 3;
+DELETE FROM table_dropped_index_no_pk WHERE b = 1;
+DELETE FROM table_dropped_index_no_pk WHERE a = 3;
+DROP TABLE table_dropped_index_no_pk;
+
-- check toast support
BEGIN;
CREATE SEQUENCE toasttable_rand_seq START 79 INCREMENT 1499; -- portable "random"
diff --git a/doc/src/sgml/advanced.sgml b/doc/src/sgml/advanced.sgml
index d77312600f7b6..2d4ab85d450c1 100644
--- a/doc/src/sgml/advanced.sgml
+++ b/doc/src/sgml/advanced.sgml
@@ -616,7 +616,7 @@ CREATE TABLE cities (
);
CREATE TABLE capitals (
- state char(2)
+ state char(2) UNIQUE NOT NULL
) INHERITS (cities);
@@ -630,7 +630,8 @@ CREATE TABLE capitals (
text, a native PostgreSQL
type for variable length character strings. The
capitals table has
- an extra column, state, which shows their states. In
+ an additional column, state, which shows its
+ state abbreviation. In
PostgreSQL, a table can inherit from
zero or more other tables.
diff --git a/doc/src/sgml/brin.sgml b/doc/src/sgml/brin.sgml
index 55b6272db62e0..4420794e5bb52 100644
--- a/doc/src/sgml/brin.sgml
+++ b/doc/src/sgml/brin.sgml
@@ -120,354 +120,292 @@ LOG: request for BRIN range summarization for index "brin_wi_idx" page 128 was
@@ -576,7 +514,7 @@ typedef struct BrinOpcInfo
The options function is passed a pointer to a
- local_relopts struct, which needs to be
+ local_relopts struct, which needs to be
filled with a set of operator class specific options. The options
can be accessed from other support functions using the
PG_HAS_OPCLASS_OPTIONS() and
diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml
index d03ee4d6fa0d1..435b7cb24da94 100644
--- a/doc/src/sgml/btree.sgml
+++ b/doc/src/sgml/btree.sgml
@@ -566,7 +566,7 @@ equalimage(opcintypeoid) returns bool
options(reloptslocal_relopts *) returns void
- The function is passed a pointer to a local_relopts
+ The function is passed a pointer to a local_relopts
struct, which needs to be filled with a set of operator class
specific options. The options can be accessed from other support
functions using the PG_HAS_OPCLASS_OPTIONS() and
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 26fda20d19394..1d1b8ce8fb126 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1977,6 +1977,10 @@ SCRAM-SHA-256$<iteration count>:&l
the planner. It is updated by VACUUM,
ANALYZE, and a few DDL commands such as
CREATE INDEX.
+ If the table has never yet been vacuumed or
+ analyzed, reltuples
+ contains -1 indicating that the row count is
+ unknown.
@@ -9226,6 +9230,11 @@ SCRAM-SHA-256$<iteration count>:&l
available versions of extensions
+
+ pg_backend_memory_contexts
+ backend memory contexts
+
+
pg_configcompile-time configuration parameters
@@ -9577,6 +9586,127 @@ SCRAM-SHA-256$<iteration count>:&l
+
+ pg_backend_memory_contexts
+
+
+ pg_backend_memory_contexts
+
+
+
+ The view pg_backend_memory_contexts displays all
+ the memory contexts of the server process attached to the current session.
+
+
+ pg_backend_memory_contexts contains one row
+ for each memory context.
+
+
+
+ pg_backend_memory_contexts Columns
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ nametext
+
+
+ Name of the memory context
+
+
+
+
+
+ identtext
+
+
+ Identification information of the memory context. This field is truncated at 1024 bytes
+
+
+
+
+
+ parenttext
+
+
+ Name of the parent of this memory context
+
+
+
+
+
+ levelint4
+
+
+ Distance from TopMemoryContext in context tree
+
+
+
+
+
+ total_bytesint8
+
+
+ Total bytes allocated for this memory context
+
+
+
+
+
+ total_nblocksint8
+
+
+ Total number of blocks allocated for this memory context
+
+
+
+
+
+ free_bytesint8
+
+
+ Free space in bytes
+
+
+
+
+
+ free_chunksint8
+
+
+ Total number of free chunks
+
+
+
+
+
+ used_bytesint8
+
+
+ Used space in bytes
+
+
+
+
+
+
+
+ By default, the pg_backend_memory_contexts view can be
+ read only by superusers.
+
+
+
pg_config
@@ -10226,7 +10356,8 @@ SCRAM-SHA-256$<iteration count>:&l
and general database objects (identified by class OID and object OID,
in the same way as in pg_description or
pg_depend). Also, the right to extend a
- relation is represented as a separate lockable object.
+ relation is represented as a separate lockable object, as is the right to
+ update pg_database.datfrozenxid.
Also, advisory locks can be taken on numbers that have
user-defined meanings.
@@ -10254,6 +10385,7 @@ SCRAM-SHA-256$<iteration count>:&l
Type of the lockable object:
relation,
extend,
+ frozenid,
page,
tuple,
transactionid,
diff --git a/doc/src/sgml/datetime.sgml b/doc/src/sgml/datetime.sgml
index bbf50b76f8c3d..39fbc39cb0ddb 100644
--- a/doc/src/sgml/datetime.sgml
+++ b/doc/src/sgml/datetime.sgml
@@ -564,8 +564,8 @@
- PostgreSQL can accept time zone specifications that
- are written according to the POSIX standard's rules
+ PostgreSQL can accept time zone specifications
+ that are written according to the POSIX standard's rules
for the TZ environment
variable. POSIX time zone specifications are
inadequate to deal with the complexity of real-world time zone history,
@@ -635,8 +635,8 @@
or -). The positive sign is used for
zones west of Greenwich. (Note that this is the
opposite of the ISO-8601 sign convention used elsewhere in
- PostgreSQL.) hh can have
- one or two digits; mm
+ PostgreSQL.) hh
+ can have one or two digits; mm
and ss (if used) must have two.
diff --git a/doc/src/sgml/dml.sgml b/doc/src/sgml/dml.sgml
index 97a773095540d..3844e34a7dcce 100644
--- a/doc/src/sgml/dml.sgml
+++ b/doc/src/sgml/dml.sgml
@@ -262,7 +262,7 @@ DELETE FROM products;
- Returning Data From Modified Rows
+ Returning Data from Modified RowsRETURNING
diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml
index 74793035d7f54..72fa1272120d8 100644
--- a/doc/src/sgml/fdwhandler.sgml
+++ b/doc/src/sgml/fdwhandler.sgml
@@ -130,7 +130,8 @@ GetForeignRelSize(PlannerInfo *root,
(The initial value is
from pg_class.reltuples
which represents the total row count seen by the
- last ANALYZE.)
+ last ANALYZE; it will be -1 if
+ no ANALYZE has been done on this foreign table.)
diff --git a/doc/src/sgml/file-fdw.sgml b/doc/src/sgml/file-fdw.sgml
index ed028e4ec9426..d985ef0a069f7 100644
--- a/doc/src/sgml/file-fdw.sgml
+++ b/doc/src/sgml/file-fdw.sgml
@@ -28,7 +28,8 @@
- Specifies the file to be read. Must be an absolute path name.
+ Specifies the file to be read. Relative paths are relative to the
+ data directory.
Either filename or program must be
specified, but not both.
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 9a4ac5a1ea368..b9f591296a5d0 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1055,6 +1055,7 @@ repeat('Pg', 4) PgPgPgPg
Factorial
+ (deprecated, use factorial() instead)
5 !
@@ -1068,7 +1069,8 @@ repeat('Pg', 4) PgPgPgPgnumeric
- Factorial (as a prefix operator)
+ Factorial as a prefix operator
+ (deprecated, use factorial() instead)
!! 5
@@ -1349,7 +1351,7 @@ repeat('Pg', 4) PgPgPgPg
-
+ factorialfactorial ( bigint )
@@ -6876,7 +6878,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
- Differences From XQuery (LIKE_REGEX)
+ Differences from XQuery (LIKE_REGEX)LIKE_REGEX
@@ -14101,7 +14103,7 @@ SELECT xmltable.*
size_sq_km float PATH 'SIZE[@unit = "sq_km"]',
size_other text PATH
'concat(SIZE[@unit!="sq_km"], " ", SIZE[@unit!="sq_km"]/@unit)',
- premier_name text PATH 'PREMIER_NAME' DEFAULT 'not specified') ;
+ premier_name text PATH 'PREMIER_NAME' DEFAULT 'not specified');
id | ordinality | COUNTRY_NAME | country_id | size_sq_km | size_other | premier_name
----+------------+--------------+------------+------------+--------------+---------------
diff --git a/doc/src/sgml/gin.sgml b/doc/src/sgml/gin.sgml
index 07114f77199ce..5c8d4d52757cf 100644
--- a/doc/src/sgml/gin.sgml
+++ b/doc/src/sgml/gin.sgml
@@ -75,53 +75,62 @@
Built-in GIN Operator Classes
-
+ Name
- Indexed Data TypeIndexable Operators
- array_ops
- anyarray
-
- &&
- <@
- =
- @>
-
+ array_ops
+ && (anyarray,anyarray)
- jsonb_ops
- jsonb
-
- ?
- ?&
- ?|
- @>
- @?
- @@
-
+ @> (anyarray,anyarray)
- jsonb_path_ops
- jsonb
-
- @>
- @?
- @@
-
+ <@ (anyarray,anyarray)
- tsvector_ops
- tsvector
-
- @@
- @@@
-
+ = (anyarray,anyarray)
+
+
+ jsonb_ops
+ @> (jsonb,jsonb)
+
+
+ @? (jsonb,jsonpath)
+
+
+ @@ (jsonb,jsonpath)
+
+
+ ? (jsonb,text)
+
+
+ ?| (jsonb,text[])
+
+
+ ?& (jsonb,text[])
+
+
+ jsonb_path_ops
+ @> (jsonb,jsonb)
+
+
+ @? (jsonb,jsonpath)
+
+
+ @@ (jsonb,jsonpath)
+
+
+ tsvector_ops
+ @@ (tsvector,tsquery)
+
+
+ @@@ (tsvector,tsquery)
@@ -412,7 +421,7 @@
The options function is passed a pointer to a
- local_relopts struct, which needs to be
+ local_relopts struct, which needs to be
filled with a set of operator class specific options. The options
can be accessed from other support functions using the
PG_HAS_OPCLASS_OPTIONS() and
diff --git a/doc/src/sgml/gist.sgml b/doc/src/sgml/gist.sgml
index 5d970ee9f2f45..f9226e7a35cbb 100644
--- a/doc/src/sgml/gist.sgml
+++ b/doc/src/sgml/gist.sgml
@@ -53,157 +53,126 @@
Built-in GiST Operator Classes
-
+ Name
- Indexed Data TypeIndexable OperatorsOrdering Operators
- box_ops
- box
-
- &&
- &>
- &<
- &<|
- >>
- <<
- <<|
- <@
- @>
- @
- |&>
- |>>
- ~
- ~=
-
-
- <->
-
+ box_ops
+ << (box,box)
+ <-> (box,point)
+ &< (box,box)
+ && (box,box)
+ &> (box,box)
+ >> (box,box)
+ ~= (box,box)
+ @> (box,box)
+ <@ (box,box)
+ &<| (box,box)
+ <<| (box,box)
+ |>> (box,box)
+ |&> (box,box)
+ ~ (box,box)
+ @ (box,box)
+
- circle_ops
- circle
-
- &&
- &>
- &<
- &<|
- >>
- <<
- <<|
- <@
- @>
- @
- |&>
- |>>
- ~
- ~=
-
-
- <->
-
+ circle_ops
+ << (circle,circle)
+ <-> (circle,point)
+ &< (circle,circle)
+ &> (circle,circle)
+ >> (circle,circle)
+ <@ (circle,circle)
+ @> (circle,circle)
+ ~= (circle,circle)
+ && (circle,circle)
+ |>> (circle,circle)
+ <<| (circle,circle)
+ &<| (circle,circle)
+ |&> (circle,circle)
+ @ (circle,circle)
+ ~ (circle,circle)
+
- inet_ops
- inet, cidr
-
- &&
- >>
- >>=
- >
- >=
- <>
- <<
- <<=
- <
- <=
- =
-
-
-
+ inet_ops
+ << (inet,inet)
+
+ <<= (inet,inet)
+ >> (inet,inet)
+ >>= (inet,inet)
+ = (inet,inet)
+ <> (inet,inet)
+ < (inet,inet)
+ <= (inet,inet)
+ > (inet,inet)
+ >= (inet,inet)
+ && (inet,inet)
+
- point_ops
- point
-
- >>
- >^
- <<
- <@
- <@
- <@
- <^
- ~=
-
-
- <->
-
+ point_ops
+ >^ (point,point)
+ <-> (point,point)
+ << (point,point)
+ >> (point,point)
+ <^ (point,point)
+ ~= (point,point)
+ <@ (point,box)
+ <@ (point,polygon)
+ <@ (point,circle)
+
- poly_ops
- polygon
-
- &&
- &>
- &<
- &<|
- >>
- <<
- <<|
- <@
- @>
- @
- |&>
- |>>
- ~
- ~=
-
-
- <->
-
+ poly_ops
+ << (polygon,polygon)
+ <-> (polygon,point)
+ &< (polygon,polygon)
+ &> (polygon,polygon)
+ >> (polygon,polygon)
+ <@ (polygon,polygon)
+ @> (polygon,polygon)
+ ~= (polygon,polygon)
+ && (polygon,polygon)
+ <<| (polygon,polygon)
+ &<| (polygon,polygon)
+ |&> (polygon,polygon)
+ |>> (polygon,polygon)
+ @ (polygon,polygon)
+ ~ (polygon,polygon)
+
- range_ops
- any range type
-
- &&
- &>
- &<
- >>
- <<
- <@
- -|-
- =
- @>
- @>
-
-
-
+ range_ops
+ = (anyrange,anyrange)
+
+ && (anyrange,anyrange)
+ @> (anyrange,anyelement)
+ @> (anyrange,anyrange)
+ <@ (anyrange,anyrange)
+ << (anyrange,anyrange)
+ >> (anyrange,anyrange)
+ &< (anyrange,anyrange)
+ &> (anyrange,anyrange)
+ -|- (anyrange,anyrange)
+
- tsquery_ops
- tsquery
-
- <@
- @>
-
-
-
+ tsquery_ops
+ <@ (tsquery,tsquery)
+
+ @> (tsquery,tsquery)
- tsvector_ops
- tsvector
-
- @@
-
-
-
+ tsvector_ops
+ @@ (tsvector,tsquery)
+
@@ -962,7 +931,7 @@ LANGUAGE C STRICT;
- The function is passed a pointer to a local_relopts
+ The function is passed a pointer to a local_relopts
struct, which needs to be filled with a set of operator class
specific options. The options can be accessed from other support
functions using the PG_HAS_OPCLASS_OPTIONS() and
diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml
index a824d383f2d89..d6f79fc435ea1 100644
--- a/doc/src/sgml/high-availability.sgml
+++ b/doc/src/sgml/high-availability.sgml
@@ -2380,9 +2380,10 @@ LOG: database system is ready to accept read only connections
- The background writer is active during recovery and will perform
- restartpoints (similar to checkpoints on the primary) and normal block
- cleaning activities. This can include updates of the hint bit
+ The checkpointer process and the background writer process are active during
+ recovery. The checkpointer process will perform restartpoints (similar to
+ checkpoints on the primary) and the background writer process will perform
+ normal block cleaning activities. This can include updates of the hint bit
information stored on the standby server.
The CHECKPOINT command is accepted during recovery,
though it performs a restartpoint rather than a new checkpoint.
diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml
index f7b765f76dc9b..92556c7ce0cc0 100644
--- a/doc/src/sgml/libpq.sgml
+++ b/doc/src/sgml/libpq.sgml
@@ -781,7 +781,7 @@ PGPing PQping(const char *conninfo);
PQsetSSLKeyPassHook_OpenSSL lets an application override
- libpq's default
+ libpq's default
handling of encrypted client certificate key files using
or interactive prompting.
@@ -793,20 +793,23 @@ void PQsetSSLKeyPassHook_OpenSSL(PQsslKeyPassHook_OpenSSL_type hook);
int callback_fn(char *buf, int size, PGconn *conn);
- which libpq will then call instead of
- its default PQdefaultSSLKeyPassHook_OpenSSL handler. The callback
- should determine the password for the key and copy it to result-buffer
- buf of size size. The string in
- buf must be null-terminated. The callback must return the length of
- the password stored in buf excluding the null terminator.
- On failure, the callback should set buf[0] = '\0' and return 0.
- See PQdefaultSSLKeyPassHook_OpenSSL in libpq's
- source code for an example.
-
-
+ which libpq will then call
+ instead of its default
+ PQdefaultSSLKeyPassHook_OpenSSL handler. The
+ callback should determine the password for the key and copy it to
+ result-buffer buf of size
+ size. The string in buf
+ must be null-terminated. The callback must return the length of the
+ password stored in buf excluding the null
+ terminator. On failure, the callback should set
+ buf[0] = '\0' and return 0. See
+ PQdefaultSSLKeyPassHook_OpenSSL in
+ libpq's source code for an example.
+
+
If the user specified an explicit key location,
- its path will be in conn->pgsslkey when the callback
+ its path will be in conn->sslkey when the callback
is invoked. This will be empty if the default key path is being used.
For keys that are engine specifiers, it is up to engine implementations
whether they use the OpenSSL password callback or define their own handling.
@@ -7877,7 +7880,7 @@ ldap://ldap.acme.com/cn=dbserver,cn=hosts?pgconnectinfo?base?(objectclass=*)
~/.postgresql/postgresql.crtclient certificate
- requested by server
+ sent to server
diff --git a/doc/src/sgml/manage-ag.sgml b/doc/src/sgml/manage-ag.sgml
index 01453e6dae72e..74055a4706557 100644
--- a/doc/src/sgml/manage-ag.sgml
+++ b/doc/src/sgml/manage-ag.sgml
@@ -33,21 +33,41 @@
- When connecting to the database server, a client must specify in
- its connection request the name of the database it wants to connect
- to. It is not possible to access more than one database per
- connection. However, an application is not restricted in the number of
- connections it opens to the same or other databases. Databases are
- physically separated and access control is managed at the
- connection level. If one PostgreSQL server
- instance is to house projects or users that should be separate and
- for the most part unaware of each other, it is therefore
- recommended to put them into separate databases. If the projects
- or users are interrelated and should be able to use each other's
- resources, they should be put in the same database but possibly
- into separate schemas. Schemas are a purely logical structure and who can
- access what is managed by the privilege system. More information about
- managing schemas is in .
+ When connecting to the database server, a client must specify the
+ database name in its connection request.
+ It is not possible to access more than one database per
+ connection. However, clients can open multiple connections to
+ the same database, or different databases.
+ Database-level security has two components: access control
+ (see ), managed at the
+ connection level, and authorization control
+ (see ), managed via the grant system.
+ Foreign data wrappers (see )
+ allow for objects within one database to act as proxies for objects in
+ other database or clusters.
+ The older dblink module (see ) provides a similar capability.
+ By default, all users can connect to all databases using all connection methods.
+
+
+
+ If one PostgreSQL server cluster is planned to contain
+ unrelated projects or users that should be, for the most part, unaware
+ of each other, it is recommended to put them into separate databases and
+ adjust authorizations and access controls accordingly.
+ If the projects or users are interrelated, and thus should be able to use
+ each other's resources, they should be put in the same database but probably
+ into separate schemas; this provides a modular structure with namespace
+ isolation and authorization control.
+ More information about managing schemas is in .
+
+
+
+ While multiple databases can be created within a single cluster, it is advised
+ to consider carefully whether the benefits outweigh the risks and limitations.
+ In particular, the impact that having a shared WAL (see )
+ has on backup and recovery options. While individual databases in the cluster
+ are isolated when considered from the user's perspective, they are closely bound
+ from the database administrator's point-of-view.
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 7dcddf478a112..17a0df697848e 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1202,6 +1202,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
BufFileWriteWaiting for a write to a buffered file.
+
+ BufFileTruncate
+ Waiting for a buffered file to be truncated.
+ ControlFileReadWaiting for a read from the pg_control
@@ -1742,6 +1746,12 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
extendWaiting to extend a relation.
+
+ frozenid
+ Waiting to
+ update pg_database.datfrozenxid
+ and pg_database.datminmxid.
+ objectWaiting to acquire a lock on a non-relation database object.
@@ -1910,6 +1920,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
NotifyQueueWaiting to read or update NOTIFY messages.
+
+ NotifyQueueTail
+ Waiting to update limit on NOTIFY message
+ storage.
+ NotifySLRUWaiting to access the NOTIFY message SLRU
@@ -2086,6 +2101,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
WALWriteWaiting for WAL buffers to be written to disk.
+
+ WrapLimitsVacuum
+ Waiting to update limits on transaction id and multixact
+ consumption.
+ XactBufferWaiting for I/O on a transaction status SLRU buffer.
@@ -4428,7 +4448,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
nametext
- name of the SLRU
+ Name of the SLRU
@@ -4632,7 +4652,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
argument. The argument can be bgwriter to reset
all the counters shown in
the pg_stat_bgwriter
- view,or archiver to reset all the counters shown in
+ view, or archiver to reset all the counters shown in
the pg_stat_archiver view.
@@ -5172,8 +5192,8 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid,
finalizing analyze
- The command is updating pg_class. When this phase is completed,
- ANALYZE will end.
+ The command is updating pg_class. When this
+ phase is completed, ANALYZE will end.
diff --git a/doc/src/sgml/passwordcheck.sgml b/doc/src/sgml/passwordcheck.sgml
index 4128b6cc4f6f3..0d89bb95b9de4 100644
--- a/doc/src/sgml/passwordcheck.sgml
+++ b/doc/src/sgml/passwordcheck.sgml
@@ -25,7 +25,7 @@
You can adapt this module to your needs by changing the source code.
For example, you can use
- CrackLib
+ CrackLib
to check passwords — this only requires uncommenting
two lines in the Makefile and rebuilding the
module. (We cannot include CrackLib
diff --git a/doc/src/sgml/plhandler.sgml b/doc/src/sgml/plhandler.sgml
index e1b0af7a60d17..40ee59de9f341 100644
--- a/doc/src/sgml/plhandler.sgml
+++ b/doc/src/sgml/plhandler.sgml
@@ -96,62 +96,10 @@
- This is a template for a procedural-language handler written in C:
-
-#include "postgres.h"
-#include "executor/spi.h"
-#include "commands/trigger.h"
-#include "fmgr.h"
-#include "access/heapam.h"
-#include "utils/syscache.h"
-#include "catalog/pg_proc.h"
-#include "catalog/pg_type.h"
-
-PG_MODULE_MAGIC;
-
-PG_FUNCTION_INFO_V1(plsample_call_handler);
-
-Datum
-plsample_call_handler(PG_FUNCTION_ARGS)
-{
- Datum retval;
-
- if (CALLED_AS_TRIGGER(fcinfo))
- {
- /*
- * Called as a trigger function
- */
- TriggerData *trigdata = (TriggerData *) fcinfo->context;
-
- retval = ...
- }
- else
- {
- /*
- * Called as a function
- */
-
- retval = ...
- }
-
- return retval;
-}
-
- Only a few thousand lines of code have to be added instead of the
- dots to complete the call handler.
-
-
-
- After having compiled the handler function into a loadable module
- (see ), the following commands then
- register the sample procedural language:
-
-CREATE FUNCTION plsample_call_handler() RETURNS language_handler
- AS 'filename'
- LANGUAGE C;
-CREATE LANGUAGE plsample
- HANDLER plsample_call_handler;
-
+ A template for a procedural-language handler written as a C extension is
+ provided in src/test/modules/plsample. This is a
+ working sample demonstrating one way to create a procedural-language
+ handler, process parameters, and return a value.
diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index d5c1654b16e4f..815912666dd08 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -1657,7 +1657,7 @@ END;
- Returning From a Function
+ Returning from a Function
There are two commands available that allow you to return data
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 8b00235a5161b..0c7087397d736 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -1742,8 +1742,9 @@ simple query protocol can be used.
For the purpose of testing replication commands, you can make a replication
- connection via psql or any other libpq-using
- tool with a connection string including the replication option,
+ connection via psql or any other
+ libpq-using tool with a connection string including
+ the replication option,
e.g.:
psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
diff --git a/doc/src/sgml/ref/comment.sgml b/doc/src/sgml/ref/comment.sgml
index 965c5a40ad72a..fd7492a25567e 100644
--- a/doc/src/sgml/ref/comment.sgml
+++ b/doc/src/sgml/ref/comment.sgml
@@ -306,7 +306,7 @@ COMMENT ON TABLE mytable IS NULL;
Some more examples:
-COMMENT ON ACCESS METHOD rtree IS 'R-Tree access method';
+COMMENT ON ACCESS METHOD gin IS 'GIN index access method';
COMMENT ON AGGREGATE my_aggregate (double precision) IS 'Computes sample variance';
COMMENT ON CAST (text AS int4) IS 'Allow casts from text to int4';
COMMENT ON COLLATION "fr_CA" IS 'Canadian French';
@@ -316,6 +316,7 @@ COMMENT ON CONSTRAINT bar_col_cons ON bar IS 'Constrains column col';
COMMENT ON CONSTRAINT dom_col_constr ON DOMAIN dom IS 'Constrains col of domain';
COMMENT ON DATABASE my_database IS 'Development Database';
COMMENT ON DOMAIN my_domain IS 'Email Address Domain';
+COMMENT ON EVENT TRIGGER abort_ddl IS 'Aborts all DDL commands';
COMMENT ON EXTENSION hstore IS 'implements the hstore data type';
COMMENT ON FOREIGN DATA WRAPPER mywrapper IS 'my foreign data wrapper';
COMMENT ON FOREIGN TABLE my_foreign_table IS 'Employee Information in other database';
@@ -330,12 +331,15 @@ COMMENT ON OPERATOR CLASS int4ops USING btree IS '4 byte integer operators for b
COMMENT ON OPERATOR FAMILY integer_ops USING btree IS 'all integer operators for btrees';
COMMENT ON POLICY my_policy ON mytable IS 'Filter rows by users';
COMMENT ON PROCEDURE my_proc (integer, integer) IS 'Runs a report';
+COMMENT ON PUBLICATION alltables IS 'Publishes all operations on all tables';
COMMENT ON ROLE my_role IS 'Administration group for finance tables';
+COMMENT ON ROUTINE my_routine (integer, integer) IS 'Runs a routine (which is a function or procedure)';
COMMENT ON RULE my_rule ON my_table IS 'Logs updates of employee records';
COMMENT ON SCHEMA my_schema IS 'Departmental data';
COMMENT ON SEQUENCE my_sequence IS 'Used to generate primary keys';
COMMENT ON SERVER myserver IS 'my foreign server';
COMMENT ON STATISTICS my_statistics IS 'Improves planner row estimations';
+COMMENT ON SUBSCRIPTION alltables IS 'Subscription for all operations on all tables';
COMMENT ON TABLE my_schema.my_table IS 'Employee Information';
COMMENT ON TABLESPACE my_tablespace IS 'Tablespace for indexes';
COMMENT ON TEXT SEARCH CONFIGURATION my_config IS 'Special word filtering';
diff --git a/doc/src/sgml/ref/create_operator.sgml b/doc/src/sgml/ref/create_operator.sgml
index d5c385c087f5c..66c34e0072f0d 100644
--- a/doc/src/sgml/ref/create_operator.sgml
+++ b/doc/src/sgml/ref/create_operator.sgml
@@ -87,11 +87,18 @@ CREATE OPERATOR name (
At least one of LEFTARG and RIGHTARG must be defined. For
- binary operators, both must be defined. For right unary
+ binary operators, both must be defined. For right unary
operators, only LEFTARG should be defined, while for left
unary operators only RIGHTARG should be defined.
+
+
+ Right unary, also called postfix, operators are deprecated and will be
+ removed in PostgreSQL version 14.
+
+
+
The function_name
function must have been previously defined using CREATE
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 1c19e254dc240..906b2ccd50a2f 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -187,8 +187,7 @@ ROLLBACK;
query processing.
The number of blocks shown for an
upper-level node includes those used by all its child nodes. In text
- format, only non-zero values are printed. This parameter may only be
- used when ANALYZE is also enabled. It defaults to
+ format, only non-zero values are printed. It defaults to
FALSE.
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index e246efbdb5207..aa0b27c9f300f 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -29,51 +29,51 @@ PostgreSQL documentation
Description
- pg_basebackup is used to take base backups of
- a running PostgreSQL database cluster. These
- are taken without affecting other clients to the database, and can be used
+ pg_basebackup is used to take a base backup of
+ a running PostgreSQL database cluster. The backup
+ is taken without affecting other clients of the database, and can be used
both for point-in-time recovery (see )
- and as the starting point for a log shipping or streaming replication standby
- servers (see ).
+ and as the starting point for a log-shipping or streaming-replication standby
+ server (see ).
- pg_basebackup makes a binary copy of the database
- cluster files, while making sure the system is put in and
+ pg_basebackup makes an exact copy of the database
+ cluster's files, while making sure the server is put into and
out of backup mode automatically. Backups are always taken of the entire
database cluster; it is not possible to back up individual databases or
- database objects. For individual database backups, a tool such as
+ database objects. For selective backups, another tool such as
must be used.
The backup is made over a regular PostgreSQL
- connection, and uses the replication protocol. The connection must be made
- with a user having REPLICATION permissions
- (see ) or a superuser,
- and pg_hba.conf must explicitly permit the replication
- connection. The server must also be configured
- with set high enough to leave at least
- one session available for the backup and one for WAL streaming (if used).
+ connection that uses the replication protocol. The connection must be made
+ with a user ID that has REPLICATION permissions
+ (see ) or is a superuser,
+ and pg_hba.conf
+ must permit the replication connection. The server must also be configured
+ with set high enough to provide at
+ least one walsender for the backup plus one for WAL streaming (if used).
- There can be multiple pg_basebackups running at the same time, but it is
+ There can be multiple pg_basebackups running at the same time, but it is usually
better from a performance point of view to take only one backup, and copy
the result.
pg_basebackup can make a base backup from
- not only the primary but also the standby. To take a backup from the standby,
+ not only a primary server but also a standby. To take a backup from a standby,
set up the standby so that it can accept replication connections (that is, set
max_wal_senders and ,
- and configure host-based authentication).
+ and configure its pg_hba.conf appropriately).
You will also need to enable on the primary.
- Note that there are some limitations in an online backup from the standby:
+ Note that there are some limitations in taking a backup from a standby:
@@ -89,7 +89,7 @@ PostgreSQL documentation
- If the standby is promoted to the primary during online backup, the backup fails.
+ If the standby is promoted to be primary during backup, the backup fails.
@@ -105,7 +105,7 @@ PostgreSQL documentation
Whenever pg_basebackup is taking a base
- backup, the pg_stat_progress_basebackup
+ backup, the server's pg_stat_progress_basebackup
view will report the progress of the backup.
See for details.
@@ -116,7 +116,7 @@ PostgreSQL documentation
The following command-line options control the location and format of the
- output.
+ output:
@@ -124,15 +124,15 @@ PostgreSQL documentation
- Directory to write the output to.
- pg_basebackup will create the directory and
- any parent directories if necessary. The directory may already exist,
- but it is an error if the directory already exists and is not empty.
+ Sets the target directory to write the output to.
+ pg_basebackup will create this directory
+ (and any missing parent directories) if it does not exist. If it
+ already exists, it must be empty.
- When the backup is in tar mode, and the directory is specified as
- - (dash), the tar file will be written to
- stdout.
+ When the backup is in tar format, the target directory may be
+ specified as - (dash), causing the tar file to be
+ written to stdout.
This option is required.
@@ -155,12 +155,12 @@ PostgreSQL documentation
Write the output as plain files, with the same layout as the
- current data directory and tablespaces. When the cluster has
+ source server's data directory and tablespaces. When the cluster has
no additional tablespaces, the whole database will be placed in
the target directory. If the cluster contains additional
tablespaces, the main data directory will be placed in the
target directory, but all other tablespaces will be placed
- in the same absolute path as they have on the server.
+ in the same absolute path as they have on the source server.
This is the default format.
@@ -174,15 +174,15 @@ PostgreSQL documentation
Write the output as tar files in the target directory. The main
- data directory will be written to a file named
- base.tar, and all other tablespaces will
- be named after the tablespace OID.
-
+ data directory's contents will be written to a file named
+ base.tar, and each other tablespace will be
+ written to a separate tar file named after that tablespace's OID.
+
- If the value - (dash) is specified as
- target directory, the tar contents will be written to
- standard output, suitable for piping to for example
- gzip. This is only possible if
+ If the target directory is specified as -
+ (dash), the tar contents will be written to
+ standard output, suitable for piping to (for example)
+ gzip. This is only allowed if
the cluster has no additional tablespaces and WAL
streaming is not used.
@@ -192,40 +192,22 @@ PostgreSQL documentation
-
-
-
-
-
- The maximum transfer rate of data transferred from the server. Values are
- in kilobytes per second. Use a suffix of M to indicate megabytes
- per second. A suffix of k is also accepted, and has no effect.
- Valid values are between 32 kilobytes per second and 1024 megabytes per second.
-
-
- The purpose is to limit the impact of pg_basebackup
- on the running server.
-
-
- This option always affects transfer of the data directory. Transfer of
- WAL files is only affected if the collection method is fetch.
-
-
-
-
- Create standby.signal and append connection settings
- to postgresql.auto.conf in the output
- directory (or into the base archive file when using tar format) to
- ease setting up a standby server.
+ Creates a standby.signal file and appends
+ connection settings to the postgresql.auto.conf
+ file in the target directory (or within the base archive file when
+ using tar format). This eases setting up a standby server using the
+ results of the backup.
+
+
The postgresql.auto.conf file will record the connection
settings and, if specified, the replication slot
- that pg_basebackup is using, so that the
+ that pg_basebackup is using, so that
streaming replication will use the same settings later on.
@@ -237,17 +219,21 @@ PostgreSQL documentation
- Relocate the tablespace in directory olddir
+ Relocates the tablespace in directory olddir
to newdir during the backup. To be
effective, olddir must exactly match the
- path specification of the tablespace as it is currently defined. (But
- it is not an error if there is no tablespace
- in olddir contained in the backup.)
+ path specification of the tablespace as it is defined on the source
+ server. (But it is not an error if there is no tablespace
+ in olddir on the source server.)
+ Meanwhile newdir is a directory in the
+ receiving host's filesystem. As with the main target directory,
+ newdir need not exist already, but if
+ it does exist it must be empty.
Both olddir
- and newdir must be absolute paths. If a
- path happens to contain a = sign, escape it with a
- backslash. This option can be specified multiple times for multiple
- tablespaces. See examples below.
+ and newdir must be absolute paths. If
+ either path needs to contain an equal sign (=),
+ precede that with a backslash. This option can be specified multiple
+ times for multiple tablespaces.
@@ -263,10 +249,16 @@ PostgreSQL documentation
- Specifies the location for the write-ahead log directory.
+ Sets the directory to write WAL (write-ahead log) files to.
+ By default WAL files will be placed in
+ the pg_wal subdirectory of the target
+ directory, but this option can be used to place them elsewhere.
waldir must be an absolute path.
- The write-ahead log directory can only be specified when
- the backup is in plain mode.
+ As with the main target directory,
+ waldir need not exist already, but if
+ it does exist it must be empty.
+ This option can only be specified when
+ the backup is in plain format.
@@ -276,16 +268,16 @@ PostgreSQL documentation
- Includes the required write-ahead log files (WAL files) in the
+ Includes the required WAL (write-ahead log) files in the
backup. This will include all write-ahead logs generated during
the backup. Unless the method none is specified,
- it is possible to start a postmaster directly in the extracted
+ it is possible to start a postmaster in the target
directory without the need to consult the log archive, thus
- making this a completely standalone backup.
+ making the output a completely standalone backup.
- The following methods for collecting the write-ahead logs are
- supported:
+ The following methods for collecting the
+ write-ahead logs are supported:
@@ -293,7 +285,7 @@ PostgreSQL documentation
none
- Don't include write-ahead log in the backup.
+ Don't include write-ahead logs in the backup.
@@ -304,15 +296,16 @@ PostgreSQL documentation
The write-ahead log files are collected at the end of the backup.
- Therefore, it is necessary for the
+ Therefore, it is necessary for the source server's
parameter to be set high
- enough that the log is not removed before the end of the backup.
- If the log has been rotated when it's time to transfer it, the
- backup will fail and be unusable.
+ enough that the required log data is not removed before the end
+ of the backup. If the required log data has been recycled
+ before it's time to transfer it, the backup will fail and be
+ unusable.
- When tar format mode is used, the write-ahead log files will be
- written to the base.tar file.
+ When tar format is used, the write-ahead log files will be
+ included in the base.tar file.
@@ -322,16 +315,16 @@ PostgreSQL documentation
stream
- Stream the write-ahead log while the backup is created. This will
- open a second connection to the server and start streaming the
- write-ahead log in parallel while running the backup. Therefore,
- it will use up two connections configured by the
- parameter. As long as the
- client can keep up with write-ahead log received, using this mode
- requires no extra write-ahead logs to be saved on the primary.
+ Stream write-ahead log data while the backup is being taken.
+ This method will open a second connection to the server and
+ start streaming the write-ahead log in parallel while running
+ the backup. Therefore, it will require two replication
+ connections not just one. As long as the client can keep up
+ with the write-ahead log data, using this method requires no
+ extra write-ahead logs to be saved on the source server.
- When tar format mode is used, the write-ahead log files will be
+ When tar format is used, the write-ahead log files will be
written to a separate file named pg_wal.tar
(if the server is a version earlier than 10, the file will be named
pg_xlog.tar).
@@ -375,7 +368,7 @@ PostgreSQL documentation
The following command-line options control the generation of the
- backup and the running of the program.
+ backup and the running of the program:
@@ -383,7 +376,8 @@ PostgreSQL documentation
- Sets checkpoint mode to fast (immediate) or spread (default) (see ).
+ Sets checkpoint mode to fast (immediate) or spread (the default)
+ (see ).
@@ -393,9 +387,9 @@ PostgreSQL documentation
- This option causes creation of a replication slot named by the
- --slot option before starting the backup.
- An error is raised if the slot already exists.
+ Specifies that the replication slot named by the
+ --slot option should be created before starting
+ the backup. An error is raised if the slot already exists.
@@ -418,9 +412,9 @@ PostgreSQL documentation
By default, when pg_basebackup aborts with an
error, it removes any directories it might have created before
- discovering that it cannot finish the job (for example, data directory
- and write-ahead log directory). This option inhibits tidying-up and is
- thus useful for debugging.
+ discovering that it cannot finish the job (for example, the target
+ directory and write-ahead log directory). This option inhibits
+ tidying-up and is thus useful for debugging.
@@ -460,19 +454,41 @@ PostgreSQL documentation
+
+
+
+
+
+ Sets the maximum transfer rate at which data is collected from the
+ source server. This can be useful to limit the impact
+ of pg_basebackup on the server. Values
+ are in kilobytes per second. Use a suffix of M
+ to indicate megabytes per second. A suffix of k
+ is also accepted, and has no effect. Valid values are between 32
+ kilobytes per second and 1024 megabytes per second.
+
+
+ This option always affects transfer of the data directory. Transfer of
+ WAL files is only affected if the collection method
+ is fetch.
+
+
+
+
This option can only be used together with -X
- stream. It causes the WAL streaming to use the specified
+ stream. It causes WAL streaming to use the specified
replication slot. If the base backup is intended to be used as a
- streaming replication standby using replication slots, it should then
- use the same replication slot name
- in . That way, it is ensured that
- the server does not remove any necessary WAL data in the time between
- the end of the base backup and the start of streaming replication.
+ streaming-replication standby using a replication slot, the standby
+ should then use the same replication slot name as
+ . This ensures that the
+ primary server does not remove any necessary WAL data in the time
+ between the end of the base backup and the start of streaming
+ replication on the new standby.
The specified replication slot has to exist unless the
@@ -522,15 +538,15 @@ PostgreSQL documentation
Using a SHA hash function provides a cryptographically secure digest
of each file for users who wish to verify that the backup has not been
- tampered with, while the CRC32C algorithm provides a checksum which is
- much faster to calculate and good at catching errors due to accidental
+ tampered with, while the CRC32C algorithm provides a checksum that is
+ much faster to calculate; it is good at catching errors due to accidental
changes but is not resistant to targeted modifications. Note that, to
be useful against an adversary who has access to the backup, the backup
manifest would need to be stored securely elsewhere or otherwise
verified not to have been modified since the backup was taken.
- can be used to check the
+ can be used to check the
integrity of a backup against the backup manifest.
@@ -552,11 +568,11 @@ PostgreSQL documentation
- This option prevents the server from estimating the total
+ Prevents the server from estimating the total
amount of backup data that will be streamed, resulting in the
- backup_total column in the
- pg_stat_progress_basebackup
- to be NULL.
+ backup_total column in the
+ pg_stat_progress_basebackup view
+ always being NULL.
Without this option, the backup will start by enumerating
@@ -578,7 +594,7 @@ PostgreSQL documentation
Disables generation of a backup manifest. If this option is not
specified, the server will generate and send a backup manifest
- which can be verified using .
+ which can be verified using .
The manifest is a list of every file present in the backup with the
exception of any WAL files that may be included. It also stores the
size, last modification time, and an optional checksum for each file.
@@ -590,16 +606,17 @@ PostgreSQL documentation
- This option prevents the creation of a temporary replication slot
- during the backup even if it's supported by the server.
+ Prevents the creation of a temporary replication slot
+ for the backup.
- Temporary replication slots are created by default if no slot name
- is given with the option when using log streaming.
+ By default, if log streaming is selected but no slot name is given
+ with the option, then a temporary replication
+ slot is created (if supported by the source server).
The main purpose of this option is to allow taking a base backup when
- the server is out of free replication slots. Using replication slots
+ the server has no free replication slots. Using a replication slot
is almost always preferred, because it prevents needed WAL from being
removed by the server during the backup.
@@ -617,7 +634,7 @@ PostgreSQL documentation
By default, checksums are verified and checksum failures will result
in a non-zero exit status. However, the base backup will not be
removed in such a case, as if the option
- had been used. Checksum verifications failures will also be reported
+ had been used. Checksum verification failures will also be reported
in the
pg_stat_database view.
@@ -627,7 +644,8 @@ PostgreSQL documentation
- The following command-line options control the database connection parameters.
+ The following command-line options control the connection to the source
+ server:
@@ -641,7 +659,7 @@ PostgreSQL documentation
The option is called --dbname for consistency with other
client applications, but because pg_basebackup
- doesn't connect to any particular database in the cluster, database
+ doesn't connect to any particular database in the cluster, any database
name in the connection string will be ignored.
@@ -654,7 +672,7 @@ PostgreSQL documentation
Specifies the host name of the machine on which the server is
running. If the value begins with a slash, it is used as the
- directory for the Unix domain socket. The default is taken
+ directory for a Unix domain socket. The default is taken
from the PGHOST environment variable, if set,
else a Unix domain socket connection is attempted.
@@ -679,11 +697,12 @@ PostgreSQL documentation
- Specifies the number of seconds between status packets sent back to the
- server. This allows for easier monitoring of the progress from server.
- A value of zero disables the periodic status updates completely,
+ Specifies the number of seconds between status packets sent back to
+ the source server. Smaller values allow more accurate monitoring of
+ backup progress from the server.
+ A value of zero disables periodic status updates completely,
although an update will still be sent when requested by the server, to
- avoid timeout disconnect. The default value is 10 seconds.
+ avoid timeout-based disconnects. The default value is 10 seconds.
@@ -693,7 +712,7 @@ PostgreSQL documentation
- User name to connect as.
+ Specifies the user name to connect as.
@@ -703,7 +722,7 @@ PostgreSQL documentation
- Never issue a password prompt. If the server requires
+ Prevents issuing a password prompt. If the server requires
password authentication and a password is not available by
other means such as a .pgpass file, the
connection attempt will fail. This option can be useful in
@@ -718,8 +737,8 @@ PostgreSQL documentation
- Force pg_basebackup to prompt for a
- password before connecting to a database.
+ Forces pg_basebackup to prompt for a
+ password before connecting to the source server.
@@ -745,7 +764,7 @@ PostgreSQL documentation
- Print the pg_basebackup version and exit.
+ Prints the pg_basebackup version and exits.
@@ -755,8 +774,8 @@ PostgreSQL documentation
- Show help about pg_basebackup command line
- arguments, and exit.
+ Shows help about pg_basebackup command line
+ arguments, and exits.
@@ -787,11 +806,10 @@ PostgreSQL documentation
Notes
- At the beginning of the backup, a checkpoint needs to be written on the
- server the backup is taken from. Especially if the option
- --checkpoint=fast is not used, this can take some time
- during which pg_basebackup will be appear
- to be idle.
+ At the beginning of the backup, a checkpoint needs to be performed on the
+ source server. This can take some time (especially if the option
+ --checkpoint=fast is not used), during
+ which pg_basebackup will appear to be idle.
@@ -806,8 +824,8 @@ PostgreSQL documentation
- Tablespaces will in plain format by default be backed up to the same path
- they have on the server, unless the
+ In plain format, tablespaces will be backed up to the same path
+ they have on the source server, unless the
option --tablespace-mapping is used. Without
this option, running a plain format base backup on the same host as the
server will not work if tablespaces are in use, because the backup would
@@ -816,8 +834,9 @@ PostgreSQL documentation
- When tar format mode is used, it is the user's responsibility to unpack each
- tar file before starting the PostgreSQL server. If there are additional tablespaces, the
+ When tar format is used, it is the user's responsibility to unpack each
+ tar file before starting a PostgreSQL server that uses the data. If there
+ are additional tablespaces, the
tar files for them need to be unpacked in the correct locations. In this
case the symbolic links for those tablespaces will be created by the server
according to the contents of the tablespace_map file that is
@@ -827,15 +846,14 @@ PostgreSQL documentation
pg_basebackup works with servers of the same
or an older major version, down to 9.1. However, WAL streaming mode (-X
- stream) only works with server version 9.3 and later, and tar format mode
- (--format=tar) of the current version only works with server version 9.5
- or later.
+ stream) only works with server version 9.3 and later, and tar format
+ (--format=tar) only works with server version 9.5
+ and later.
- pg_basebackup will preserve group permissions in
- both the plain and tar formats if group
- permissions are enabled on the source cluster.
+ pg_basebackup will preserve group permissions
+ for data files if group permissions are enabled on the source cluster.
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index aac5d5be23f4f..c16f223e4edb4 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -307,7 +307,7 @@ REINDEX [ ( option [, ...] ) ] { IN
- A new temporary index definition is added to the catalog
+ A new transient index definition is added to the catalog
pg_index. This definition will be used to replace
the old index. A SHARE UPDATE EXCLUSIVE lock at
session level is taken on the indexes being reindexed as well as their
@@ -383,13 +383,15 @@ Indexes:
"idx_ccnew" btree (col) INVALID
- The recommended recovery method in such cases is to drop the invalid index
- and try again to perform REINDEX CONCURRENTLY. The
- concurrent index created during the processing has a name ending in the
- suffix ccnew, or ccold if it is an
- old index definition which we failed to drop. Invalid indexes can be
- dropped using DROP INDEX, including invalid toast
- indexes.
+ If the index marked INVALID is suffixed
+ ccnew, then it corresponds to the transient
+ index created during the concurrent operation, and the recommended
+ recovery method is to drop it using DROP INDEX,
+ then attempt REINDEX CONCURRENTLY again.
+ If the invalid index is instead suffixed ccold,
+ it corresponds to the original index which could not be dropped;
+ the recommended recovery method is to just drop said index, since the
+ rebuild proper has been successful.
diff --git a/doc/src/sgml/spgist.sgml b/doc/src/sgml/spgist.sgml
index 5d6e893d49185..68d09951d9fc8 100644
--- a/doc/src/sgml/spgist.sgml
+++ b/doc/src/sgml/spgist.sgml
@@ -64,142 +64,116 @@
@@ -897,7 +871,7 @@ LANGUAGE C STRICT;
- The function is passed a pointer to a local_relopts
+ The function is passed a pointer to a local_relopts
struct, which needs to be filled with a set of operator class
specific options. The options can be accessed from other support
functions using the PG_HAS_OPCLASS_OPTIONS() and
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml
index 2f993ca2e037c..b0ae5d2e127e1 100644
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -977,27 +977,8 @@ CAST ( 'string' AS type )
Most operators have the same precedence and are left-associative.
The precedence and associativity of the operators is hard-wired
into the parser.
-
-
-
- You will
- sometimes need to add parentheses when using combinations of
- binary and unary operators. For instance:
-
-SELECT 5 ! - 6;
-
- will be parsed as:
-
-SELECT 5 ! (- 6);
-
- because the parser has no idea — until it is too late
- — that ! is defined as a postfix operator,
- not an infix one. To get the desired behavior in this case, you
- must write:
-
-SELECT (5 !) - 6;
-
- This is the price one pays for extensibility.
+ Add parentheses if you want an expression with multiple operators
+ to be parsed in some other way than what the precedence rules imply.
@@ -1378,7 +1359,7 @@ CREATE FUNCTION dept(text) RETURNS dept
(Here, the brackets [ ] are meant to appear literally.)
Each subscript is itself an expression,
- which must yield an integer value.
+ which will be rounded to the nearest integer value.
diff --git a/doc/src/sgml/typeconv.sgml b/doc/src/sgml/typeconv.sgml
index 81dba7dacfed5..98662fc91fb6d 100644
--- a/doc/src/sgml/typeconv.sgml
+++ b/doc/src/sgml/typeconv.sgml
@@ -354,20 +354,19 @@ Some examples follow.
-Factorial Operator Type Resolution
+Square Root Operator Type Resolution
-There is only one factorial operator (postfix !)
+There is only one square root operator (prefix |/)
defined in the standard catalog, and it takes an argument of type
-bigint.
+double precision.
The scanner assigns an initial type of integer to the argument
in this query expression:
-SELECT 40 ! AS "40 factorial";
-
- 40 factorial
---------------------------------------------------
- 815915283247897734345611269596115894272000000000
+SELECT |/ 40 AS "square root of 40";
+ square root of 40
+-------------------
+ 6.324555320336759
(1 row)
@@ -375,7 +374,7 @@ So the parser does a type conversion on the operand and the query
is equivalent to:
-SELECT CAST(40 AS bigint) ! AS "40 factorial";
+SELECT |/ CAST(40 AS double precision) AS "square root of 40";
@@ -1069,7 +1068,7 @@ domain's base type for all subsequent steps.
functions, this behavior allows a domain type to be preserved through
a UNION or similar construct, so long as the user is
careful to ensure that all inputs are implicitly or explicitly of that
- exact type. Otherwise the domain's base type will be preferred.
+ exact type. Otherwise the domain's base type will be used.
@@ -1092,24 +1091,29 @@ If the non-unknown inputs are not all of the same type category, fail.
-Choose the first non-unknown input type which is a preferred type in
-that category, if there is one.
-
-
-
-
-
-Otherwise, choose the last non-unknown input type that allows all the
-preceding non-unknown inputs to be implicitly converted to it. (There
-always is such a type, since at least the first type in the list must
-satisfy this condition.)
+Select the first non-unknown input type as the candidate type,
+then consider each other non-unknown input type, left to right.
+
+
+ For historical reasons, CASE treats
+ its ELSE clause (if any) as the first
+ input, with the THEN clauses(s) considered after
+ that. In all other cases, left to right means the order
+ in which the expressions appear in the query text.
+
+
+If the candidate type can be implicitly converted to the other type,
+but not vice-versa, select the other type as the new candidate type.
+Then continue considering the remaining inputs. If, at any stage of this
+process, a preferred type is selected, stop considering additional
+inputs.
-Convert all inputs to the selected type. Fail if there is not a
-conversion from a given input to the selected type.
+Convert all inputs to the final candidate type. Fail if there is not an
+implicit conversion from a given input type to the candidate type.
diff --git a/doc/src/sgml/xfunc.sgml b/doc/src/sgml/xfunc.sgml
index 6de464c654577..732d93552127e 100644
--- a/doc/src/sgml/xfunc.sgml
+++ b/doc/src/sgml/xfunc.sgml
@@ -84,8 +84,11 @@
A procedure is a database object similar to a function. The difference is
that a procedure does not return a value, so there is no return type
declaration. While a function is called as part of a query or DML
- command, a procedure is called explicitly using
- the statement.
+ command, a procedure is called in isolation using
+ the command. If the CALL command is not
+ part of an explicit transaction, a procedure in many server-side
+ languages can commit, rollback, and begin new transactions during
+ its execution, which is not possible in functions.
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 7bdcbc858e39f..2cfccdedcf59f 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -264,24 +264,28 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
/* Search forward to re-find idatum */
for (;;)
{
- Datum newDatum;
- GinNullCategory newCategory;
-
if (moveRightIfItNeeded(btree, stack, snapshot) == false)
- elog(ERROR, "lost saved point in index"); /* must not happen !!! */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to re-find tuple within index \"%s\"",
+ RelationGetRelationName(btree->index))));
page = BufferGetPage(stack->buffer);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
- if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
- elog(ERROR, "lost saved point in index"); /* must not happen !!! */
- newDatum = gintuple_get_key(btree->ginstate, itup,
- &newCategory);
+ if (gintuple_get_attrnum(btree->ginstate, itup) == attnum)
+ {
+ Datum newDatum;
+ GinNullCategory newCategory;
+
+ newDatum = gintuple_get_key(btree->ginstate, itup,
+ &newCategory);
- if (ginCompareEntries(btree->ginstate, attnum,
- newDatum, newCategory,
- idatum, icategory) == 0)
- break; /* Found! */
+ if (ginCompareEntries(btree->ginstate, attnum,
+ newDatum, newCategory,
+ idatum, icategory) == 0)
+ break; /* Found! */
+ }
stack->off++;
}
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index 9cd6638df6210..0935a6d9e53d6 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -727,7 +727,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
* entries. This is bogus if the index is partial, but it's real hard to
* tell how many distinct heap entries are referenced by a GIN index.
*/
- stats->num_index_tuples = info->num_heap_tuples;
+ stats->num_index_tuples = Max(info->num_heap_tuples, 0);
stats->estimated_count = info->estimated_count;
/*
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f75e1cf0e7b0f..9b5f417eac442 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6920,8 +6920,6 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
* updated/deleted by the inserting transaction.
*
* Look for a committed hint bit, or if no xmin bit is set, check clog.
- * This needs to work on both primary and standby, where it is used to
- * assess btree delete records.
*/
if (HeapTupleHeaderXminCommitted(tuple) ||
(!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 3ad4222cb8aff..bc510e2e9b36c 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -188,7 +188,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
/* OK to prune */
(void) heap_page_prune(relation, buffer, vistest,
limited_xmin, limited_ts,
- true, &ignore);
+ true, &ignore, NULL);
}
/* And release buffer lock */
@@ -213,6 +213,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
* send its own new total to pgstats, and we don't want this delta applied
* on top of that.)
*
+ * off_loc is the offset location required by the caller to use in error
+ * callback.
+ *
* Returns the number of tuples deleted from the page and sets
* latestRemovedXid.
*/
@@ -221,7 +224,8 @@ heap_page_prune(Relation relation, Buffer buffer,
GlobalVisState *vistest,
TransactionId old_snap_xmin,
TimestampTz old_snap_ts,
- bool report_stats, TransactionId *latestRemovedXid)
+ bool report_stats, TransactionId *latestRemovedXid,
+ OffsetNumber *off_loc)
{
int ndeleted = 0;
Page page = BufferGetPage(buffer);
@@ -262,6 +266,13 @@ heap_page_prune(Relation relation, Buffer buffer,
if (prstate.marked[offnum])
continue;
+ /*
+ * Set the offset number so that we can display it along with any
+ * error that occurred while processing this tuple.
+ */
+ if (off_loc)
+ *off_loc = offnum;
+
/* Nothing to do if slot is empty or already dead */
itemid = PageGetItemId(page, offnum);
if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
@@ -271,6 +282,10 @@ heap_page_prune(Relation relation, Buffer buffer,
ndeleted += heap_prune_chain(buffer, offnum, &prstate);
}
+ /* Clear the offset information once we have processed the given page. */
+ if (off_loc)
+ *off_loc = InvalidOffsetNumber;
+
/* Any error while applying the changes is critical */
START_CRIT_SECTION();
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 44e2224dd557b..53b1a952543b7 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -208,7 +208,8 @@ typedef struct LVShared
* live tuples in the index vacuum case or the new live tuples in the
* index cleanup case.
*
- * estimated_count is true if reltuples is an estimated value.
+ * estimated_count is true if reltuples is an estimated value. (Note that
+ * reltuples could be -1 in this case, indicating we have no idea.)
*/
double reltuples;
bool estimated_count;
@@ -316,6 +317,7 @@ typedef struct LVRelStats
/* Used for error callback */
char *indname;
BlockNumber blkno; /* used only for heap operations */
+ OffsetNumber offnum; /* used only for heap operations */
VacErrPhase phase;
} LVRelStats;
@@ -323,6 +325,7 @@ typedef struct LVRelStats
typedef struct LVSavedErrInfo
{
BlockNumber blkno;
+ OffsetNumber offnum;
VacErrPhase phase;
} LVSavedErrInfo;
@@ -341,7 +344,8 @@ static void lazy_scan_heap(Relation onerel, VacuumParams *params,
LVRelStats *vacrelstats, Relation *Irel, int nindexes,
bool aggressive);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
-static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
+static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
+ LVRelStats *vacrelstats);
static void lazy_vacuum_all_indexes(Relation onerel, Relation *Irel,
IndexBulkDeleteResult **stats,
LVRelStats *vacrelstats, LVParallelState *lps,
@@ -364,6 +368,7 @@ static void lazy_record_dead_tuple(LVDeadTuples *dead_tuples,
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int vac_cmp_itemptr(const void *left, const void *right);
static bool heap_page_is_all_visible(Relation rel, Buffer buf,
+ LVRelStats *vacrelstats,
TransactionId *visibility_cutoff_xid, bool *all_frozen);
static void lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
LVRelStats *vacrelstats, LVParallelState *lps,
@@ -396,7 +401,8 @@ static LVSharedIndStats *get_indstats(LVShared *lvshared, int n);
static bool skip_parallel_vacuum_index(Relation indrel, LVShared *lvshared);
static void vacuum_error_callback(void *arg);
static void update_vacuum_error_info(LVRelStats *errinfo, LVSavedErrInfo *saved_err_info,
- int phase, BlockNumber blkno);
+ int phase, BlockNumber blkno,
+ OffsetNumber offnum);
static void restore_vacuum_error_info(LVRelStats *errinfo, const LVSavedErrInfo *saved_err_info);
@@ -547,7 +553,8 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
* revert to the previous phase.
*/
update_vacuum_error_info(vacrelstats, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
- vacrelstats->nonempty_pages);
+ vacrelstats->nonempty_pages,
+ InvalidOffsetNumber);
lazy_truncate_heap(onerel, vacrelstats);
}
@@ -561,31 +568,19 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
/*
* Update statistics in pg_class.
*
- * A corner case here is that if we scanned no pages at all because every
- * page is all-visible, we should not update relpages/reltuples, because
- * we have no new information to contribute. In particular this keeps us
- * from replacing relpages=reltuples=0 (which means "unknown tuple
- * density") with nonzero relpages and reltuples=0 (which means "zero
- * tuple density") unless there's some actual evidence for the latter.
+ * In principle new_live_tuples could be -1 indicating that we (still)
+ * don't know the tuple count. In practice that probably can't happen,
+ * since we'd surely have scanned some pages if the table is new and
+ * nonempty.
*
- * It's important that we use tupcount_pages and not scanned_pages for the
- * check described above; scanned_pages counts pages where we could not
- * get cleanup lock, and which were processed only for frozenxid purposes.
- *
- * We do update relallvisible even in the corner case, since if the table
- * is all-visible we'd definitely like to know that. But clamp the value
- * to be not more than what we're setting relpages to.
+ * For safety, clamp relallvisible to be not more than what we're setting
+ * relpages to.
*
* Also, don't change relfrozenxid/relminmxid if we skipped any pages,
* since then we don't know for certain that all tuples have a newer xmin.
*/
new_rel_pages = vacrelstats->rel_pages;
new_live_tuples = vacrelstats->new_live_tuples;
- if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0)
- {
- new_rel_pages = vacrelstats->old_rel_pages;
- new_live_tuples = vacrelstats->old_live_tuples;
- }
visibilitymap_count(onerel, &new_rel_allvisible, NULL);
if (new_rel_allvisible > new_rel_pages)
@@ -606,7 +601,7 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
/* report results to the stats collector, too */
pgstat_report_vacuum(RelationGetRelid(onerel),
onerel->rd_rel->relisshared,
- new_live_tuples,
+ Max(new_live_tuples, 0),
vacrelstats->new_dead_tuples);
pgstat_progress_end_command();
@@ -960,7 +955,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
update_vacuum_error_info(vacrelstats, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
- blkno);
+ blkno, InvalidOffsetNumber);
if (blkno == next_unskippable_block)
{
@@ -1129,7 +1124,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
* to use lazy_check_needs_freeze() for both situations, though.
*/
LockBuffer(buf, BUFFER_LOCK_SHARE);
- if (!lazy_check_needs_freeze(buf, &hastup))
+ if (!lazy_check_needs_freeze(buf, &hastup, vacrelstats))
{
UnlockReleaseBuffer(buf);
vacrelstats->scanned_pages++;
@@ -1244,7 +1239,8 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
*/
tups_vacuumed += heap_page_prune(onerel, buf, vistest, false,
InvalidTransactionId, 0,
- &vacrelstats->latestRemovedXid);
+ &vacrelstats->latestRemovedXid,
+ &vacrelstats->offnum);
/*
* Now scan the page to collect vacuumable items and check for tuples
@@ -1267,6 +1263,11 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
{
ItemId itemid;
+ /*
+ * Set the offset number so that we can display it along with any
+ * error that occurred while processing this tuple.
+ */
+ vacrelstats->offnum = offnum;
itemid = PageGetItemId(page, offnum);
/* Unused items require no processing, but we count 'em */
@@ -1468,6 +1469,12 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
}
} /* scan along page */
+ /*
+ * Clear the offset information once we have processed all the tuples
+ * on the page.
+ */
+ vacrelstats->offnum = InvalidOffsetNumber;
+
/*
* If we froze any tuples, mark the buffer dirty, and write a WAL
* record recording the changes. We must log the changes to be
@@ -1662,6 +1669,9 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
/* report that everything is scanned and vacuumed */
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
+ /* Clear the block number information */
+ vacrelstats->blkno = InvalidBlockNumber;
+
pfree(frozen);
/* save stats for use later */
@@ -1674,9 +1684,12 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
vacrelstats->tupcount_pages,
live_tuples);
- /* also compute total number of surviving heap entries */
+ /*
+ * Also compute the total number of surviving heap entries. In the
+ * (unlikely) scenario that new_live_tuples is -1, take it as zero.
+ */
vacrelstats->new_rel_tuples =
- vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples;
+ Max(vacrelstats->new_live_tuples, 0) + vacrelstats->new_dead_tuples;
/*
* Release any remaining pin on visibility map page.
@@ -1842,7 +1855,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
/* Update error traceback information */
update_vacuum_error_info(vacrelstats, &saved_err_info, VACUUM_ERRCB_PHASE_VACUUM_HEAP,
- InvalidBlockNumber);
+ InvalidBlockNumber, InvalidOffsetNumber);
pg_rusage_init(&ru0);
npages = 0;
@@ -1879,6 +1892,9 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
npages++;
}
+ /* Clear the block number information */
+ vacrelstats->blkno = InvalidBlockNumber;
+
if (BufferIsValid(vmbuffer))
{
ReleaseBuffer(vmbuffer);
@@ -1921,7 +1937,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
/* Update error traceback information */
update_vacuum_error_info(vacrelstats, &saved_err_info, VACUUM_ERRCB_PHASE_VACUUM_HEAP,
- blkno);
+ blkno, InvalidOffsetNumber);
START_CRIT_SECTION();
@@ -1973,7 +1989,8 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
* dirty, exclusively locked, and, if needed, a full page image has been
* emitted in the log_heap_clean() above.
*/
- if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
+ if (heap_page_is_all_visible(onerel, buffer, vacrelstats,
+ &visibility_cutoff_xid,
&all_frozen))
PageSetAllVisible(page);
@@ -2012,7 +2029,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
* Also returns a flag indicating whether page contains any tuples at all.
*/
static bool
-lazy_check_needs_freeze(Buffer buf, bool *hastup)
+lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelStats *vacrelstats)
{
Page page = BufferGetPage(buf);
OffsetNumber offnum,
@@ -2037,6 +2054,11 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup)
{
ItemId itemid;
+ /*
+ * Set the offset number so that we can display it along with any
+ * error that occurred while processing this tuple.
+ */
+ vacrelstats->offnum = offnum;
itemid = PageGetItemId(page, offnum);
/* this should match hastup test in count_nondeletable_pages() */
@@ -2051,10 +2073,13 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup)
if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
MultiXactCutoff, buf))
- return true;
+ break;
} /* scan along page */
- return false;
+ /* Clear the offset information once we have processed the given page. */
+ vacrelstats->offnum = InvalidOffsetNumber;
+
+ return (offnum <= maxoff);
}
/*
@@ -2401,7 +2426,7 @@ lazy_cleanup_all_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
* dead_tuples, and update running statistics.
*
* reltuples is the number of heap tuples to be passed to the
- * bulkdelete callback.
+ * bulkdelete callback. It's always assumed to be estimated.
*/
static void
lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats,
@@ -2432,7 +2457,7 @@ lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats,
vacrelstats->indname = pstrdup(RelationGetRelationName(indrel));
update_vacuum_error_info(vacrelstats, &saved_err_info,
VACUUM_ERRCB_PHASE_VACUUM_INDEX,
- InvalidBlockNumber);
+ InvalidBlockNumber, InvalidOffsetNumber);
/* Do bulk deletion */
*stats = index_bulk_delete(&ivinfo, *stats,
@@ -2492,34 +2517,34 @@ lazy_cleanup_index(Relation indrel,
vacrelstats->indname = pstrdup(RelationGetRelationName(indrel));
update_vacuum_error_info(vacrelstats, &saved_err_info,
VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
- InvalidBlockNumber);
+ InvalidBlockNumber, InvalidOffsetNumber);
*stats = index_vacuum_cleanup(&ivinfo, *stats);
- /* Revert back to the old phase information for error traceback */
+ if (*stats)
+ {
+ if (IsParallelWorker())
+ msg = gettext_noop("index \"%s\" now contains %.0f row versions in %u pages as reported by parallel vacuum worker");
+ else
+ msg = gettext_noop("index \"%s\" now contains %.0f row versions in %u pages");
+
+ ereport(elevel,
+ (errmsg(msg,
+ RelationGetRelationName(indrel),
+ (*stats)->num_index_tuples,
+ (*stats)->num_pages),
+ errdetail("%.0f index row versions were removed.\n"
+ "%u index pages have been deleted, %u are currently reusable.\n"
+ "%s.",
+ (*stats)->tuples_removed,
+ (*stats)->pages_deleted, (*stats)->pages_free,
+ pg_rusage_show(&ru0))));
+ }
+
+ /* Revert to the previous phase information for error traceback */
restore_vacuum_error_info(vacrelstats, &saved_err_info);
pfree(vacrelstats->indname);
vacrelstats->indname = NULL;
-
- if (!(*stats))
- return;
-
- if (IsParallelWorker())
- msg = gettext_noop("index \"%s\" now contains %.0f row versions in %u pages as reported by parallel vacuum worker");
- else
- msg = gettext_noop("index \"%s\" now contains %.0f row versions in %u pages");
-
- ereport(elevel,
- (errmsg(msg,
- RelationGetRelationName(indrel),
- (*stats)->num_index_tuples,
- (*stats)->num_pages),
- errdetail("%.0f index row versions were removed.\n"
- "%u index pages have been deleted, %u are currently reusable.\n"
- "%s.",
- (*stats)->tuples_removed,
- (*stats)->pages_deleted, (*stats)->pages_free,
- pg_rusage_show(&ru0))));
}
/*
@@ -2958,6 +2983,7 @@ vac_cmp_itemptr(const void *left, const void *right)
*/
static bool
heap_page_is_all_visible(Relation rel, Buffer buf,
+ LVRelStats *vacrelstats,
TransactionId *visibility_cutoff_xid,
bool *all_frozen)
{
@@ -2982,6 +3008,11 @@ heap_page_is_all_visible(Relation rel, Buffer buf,
ItemId itemid;
HeapTupleData tuple;
+ /*
+ * Set the offset number so that we can display it along with any
+ * error that occurred while processing this tuple.
+ */
+ vacrelstats->offnum = offnum;
itemid = PageGetItemId(page, offnum);
/* Unused or redirect line pointers are of no interest */
@@ -3059,6 +3090,9 @@ heap_page_is_all_visible(Relation rel, Buffer buf,
}
} /* scan along page */
+ /* Clear the offset information once we have processed the given page. */
+ vacrelstats->offnum = InvalidOffsetNumber;
+
return all_visible;
}
@@ -3580,14 +3614,32 @@ vacuum_error_callback(void *arg)
{
case VACUUM_ERRCB_PHASE_SCAN_HEAP:
if (BlockNumberIsValid(errinfo->blkno))
- errcontext("while scanning block %u of relation \"%s.%s\"",
- errinfo->blkno, errinfo->relnamespace, errinfo->relname);
+ {
+ if (OffsetNumberIsValid(errinfo->offnum))
+ errcontext("while scanning block %u and offset %u of relation \"%s.%s\"",
+ errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
+ else
+ errcontext("while scanning block %u of relation \"%s.%s\"",
+ errinfo->blkno, errinfo->relnamespace, errinfo->relname);
+ }
+ else
+ errcontext("while scanning relation \"%s.%s\"",
+ errinfo->relnamespace, errinfo->relname);
break;
case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
if (BlockNumberIsValid(errinfo->blkno))
- errcontext("while vacuuming block %u of relation \"%s.%s\"",
- errinfo->blkno, errinfo->relnamespace, errinfo->relname);
+ {
+ if (OffsetNumberIsValid(errinfo->offnum))
+ errcontext("while vacuuming block %u and offset %u of relation \"%s.%s\"",
+ errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
+ else
+ errcontext("while vacuuming block %u of relation \"%s.%s\"",
+ errinfo->blkno, errinfo->relnamespace, errinfo->relname);
+ }
+ else
+ errcontext("while vacuuming relation \"%s.%s\"",
+ errinfo->relnamespace, errinfo->relname);
break;
case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
@@ -3619,15 +3671,17 @@ vacuum_error_callback(void *arg)
*/
static void
update_vacuum_error_info(LVRelStats *errinfo, LVSavedErrInfo *saved_err_info, int phase,
- BlockNumber blkno)
+ BlockNumber blkno, OffsetNumber offnum)
{
if (saved_err_info)
{
+ saved_err_info->offnum = errinfo->offnum;
saved_err_info->blkno = errinfo->blkno;
saved_err_info->phase = errinfo->phase;
}
errinfo->blkno = blkno;
+ errinfo->offnum = offnum;
errinfo->phase = phase;
}
@@ -3638,5 +3692,6 @@ static void
restore_vacuum_error_info(LVRelStats *errinfo, const LVSavedErrInfo *saved_err_info)
{
errinfo->blkno = saved_err_info->blkno;
+ errinfo->offnum = saved_err_info->offnum;
errinfo->phase = saved_err_info->phase;
}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 8fa6ac7296b90..c822b49a71022 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -853,6 +853,7 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
if (cleanup_scale_factor <= 0 ||
+ info->num_heap_tuples < 0 ||
prev_num_heap_tuples <= 0 ||
(info->num_heap_tuples - prev_num_heap_tuples) /
prev_num_heap_tuples >= cleanup_scale_factor)
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index c638319765756..6438c457161ac 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -701,18 +701,14 @@ table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
* doesn't happen instantaneously, and it won't happen at all for cases
* such as temporary tables.)
*
- * We approximate "never vacuumed" by "has relpages = 0", which means this
- * will also fire on genuinely empty relations. Not great, but
- * fortunately that's a seldom-seen case in the real world, and it
- * shouldn't degrade the quality of the plan too much anyway to err in
- * this direction.
+ * We test "never vacuumed" by seeing whether reltuples < 0.
*
* If the table has inheritance children, we don't apply this heuristic.
* Totally empty parent tables are quite common, so we should be willing
* to believe that they are empty.
*/
if (curpages < 10 &&
- relpages == 0 &&
+ reltuples < 0 &&
!rel->rd_rel->relhassubclass)
curpages = 10;
@@ -727,17 +723,17 @@ table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
}
/* estimate number of tuples from previous tuple density */
- if (relpages > 0)
+ if (reltuples >= 0 && relpages > 0)
density = reltuples / (double) relpages;
else
{
/*
- * When we have no data because the relation was truncated, estimate
- * tuple width from attribute datatypes. We assume here that the
- * pages are completely full, which is OK for tables (since they've
- * presumably not been VACUUMed yet) but is probably an overestimate
- * for indexes. Fortunately get_relation_info() can clamp the
- * overestimate to the parent table's size.
+ * When we have no data because the relation was never yet vacuumed,
+ * estimate tuple width from attribute datatypes. We assume here that
+ * the pages are completely full, which is OK for tables but is
+ * probably an overestimate for indexes. Fortunately
+ * get_relation_info() can clamp the overestimate to the parent
+ * table's size.
*
* Note: this code intentionally disregards alignment considerations,
* because (a) that would be gilding the lily considering how crude
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index c5f09667ba159..1edc8180c1284 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -635,12 +635,11 @@ be reconstructed later following a crash and the action is simply a way
of optimising for performance. When a hint is written we use
MarkBufferDirtyHint() to mark the block dirty.
-If the buffer is clean and checksums are in use then
-MarkBufferDirtyHint() inserts an XLOG_FPI record to ensure that we
-take a full page image that includes the hint. We do this to avoid
-a partial page write, when we write the dirtied page. WAL is not
-written during recovery, so we simply skip dirtying blocks because
-of hints when in recovery.
+If the buffer is clean and checksums are in use then MarkBufferDirtyHint()
+inserts an XLOG_FPI_FOR_HINT record to ensure that we take a full page image
+that includes the hint. We do this to avoid a partial page write, when we
+write the dirtied page. WAL is not written during recovery, so we simply skip
+dirtying blocks because of hints when in recovery.
If you do decide to optimise away a WAL record, then any calls to
MarkBufferDirty() must be replaced by MarkBufferDirtyHint(),
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index d1dbb43e096c1..7640f153c227b 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -1191,6 +1191,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
/*
* Remove all segments before the one holding the passed page number
+ *
+ * All SLRUs prevent concurrent calls to this function, either with an LWLock
+ * or by calling it only as part of a checkpoint. Mutual exclusion must begin
+ * before computing cutoffPage. Mutual exclusion must end after any limit
+ * update that would permit other backends to write fresh data into the
+ * segment immediately preceding the one containing cutoffPage. Otherwise,
+ * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
+ * after it has accrued freshly-written data.
*/
void
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index a087a5554210c..a50f60b99af28 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -349,8 +349,8 @@ ExtendSUBTRANS(TransactionId newestXact)
/*
* Remove all SUBTRANS segments before the one holding the passed transaction ID
*
- * This is normally called during checkpoint, with oldestXact being the
- * oldest TransactionXmin of any running transaction.
+ * oldestXact is the oldest TransactionXmin of any running transaction. This
+ * is called only during checkpoint.
*/
void
TruncateSUBTRANS(TransactionId oldestXact)
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 2d2b05be36c47..a4944faa32e34 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -367,12 +367,13 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
* We'll refuse to continue assigning XIDs in interactive mode once we get
* within 3M transactions of data loss. This leaves lots of room for the
* DBA to fool around fixing things in a standalone backend, while not
- * being significant compared to total XID space. (Note that since
- * vacuuming requires one transaction per table cleaned, we had better be
- * sure there's lots of XIDs left...) Also, at default BLCKSZ, this
- * leaves two completely-idle segments. In the event of edge-case bugs
- * involving page or segment arithmetic, idle segments render the bugs
- * unreachable outside of single-user mode.
+ * being significant compared to total XID space. (VACUUM requires an XID
+ * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA
+ * might do by reflex, assigns an XID. Hence, we had better be sure
+ * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two
+ * completely-idle segments. In the event of edge-case bugs involving
+ * page or segment arithmetic, idle segments render the bugs unreachable
+ * outside of single-user mode.
*/
xidStopLimit = xidWrapLimit - 3000000;
if (xidStopLimit < FirstNormalTransactionId)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 7ccb7d68ed9a6..af6afcebb133f 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -5565,6 +5565,7 @@ XactLogCommitRecord(TimestampTz commit_time,
{
xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES;
xl_relfilenodes.nrels = nrels;
+ info |= XLR_SPECIAL_REL_UPDATE;
}
if (nmsgs > 0)
@@ -5697,6 +5698,7 @@ XactLogAbortRecord(TimestampTz abort_time,
{
xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES;
xl_relfilenodes.nrels = nrels;
+ info |= XLR_SPECIAL_REL_UPDATE;
}
if (TransactionIdIsValid(twophase_xid))
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index f2ca686397ebd..abd5bdb866b3a 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1015,7 +1015,7 @@ AddNewRelationTuple(Relation pg_class_desc,
case RELKIND_TOASTVALUE:
/* The relation is real, but as yet empty */
new_rel_reltup->relpages = 0;
- new_rel_reltup->reltuples = 0;
+ new_rel_reltup->reltuples = -1;
new_rel_reltup->relallvisible = 0;
break;
case RELKIND_SEQUENCE:
@@ -1027,7 +1027,7 @@ AddNewRelationTuple(Relation pg_class_desc,
default:
/* Views, etc, have no disk storage */
new_rel_reltup->relpages = 0;
- new_rel_reltup->reltuples = 0;
+ new_rel_reltup->reltuples = -1;
new_rel_reltup->relallvisible = 0;
break;
}
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 1be27eec52e6e..d0ec9a4b9c80e 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -1512,7 +1512,6 @@ index_concurrently_swap(Oid newIndexId, Oid oldIndexId, const char *oldName)
/* Preserve indisreplident in the new index */
newIndexForm->indisreplident = oldIndexForm->indisreplident;
- oldIndexForm->indisreplident = false;
/* Preserve indisclustered in the new index */
newIndexForm->indisclustered = oldIndexForm->indisclustered;
@@ -1524,6 +1523,7 @@ index_concurrently_swap(Oid newIndexId, Oid oldIndexId, const char *oldName)
newIndexForm->indisvalid = true;
oldIndexForm->indisvalid = false;
oldIndexForm->indisclustered = false;
+ oldIndexForm->indisreplident = false;
CatalogTupleUpdate(pg_index, &oldIndexTuple->t_self, oldIndexTuple);
CatalogTupleUpdate(pg_index, &newIndexTuple->t_self, newIndexTuple);
@@ -2722,6 +2722,15 @@ index_update_stats(Relation rel,
/* Should this be a more comprehensive test? */
Assert(rd_rel->relkind != RELKIND_PARTITIONED_INDEX);
+ /*
+ * As a special hack, if we are dealing with an empty table and the
+ * existing reltuples is -1, we leave that alone. This ensures that
+ * creating an index as part of CREATE TABLE doesn't cause the table to
+ * prematurely look like it's been vacuumed.
+ */
+ if (reltuples == 0 && rd_rel->reltuples < 0)
+ reltuples = -1;
+
/* Apply required updates, if any, to copied tuple */
dirty = false;
@@ -3349,10 +3358,13 @@ index_set_state_flags(Oid indexId, IndexStateFlagsAction action)
* CONCURRENTLY that failed partway through.)
*
* Note: the CLUSTER logic assumes that indisclustered cannot be
- * set on any invalid index, so clear that flag too.
+ * set on any invalid index, so clear that flag too. Similarly,
+ * ALTER TABLE assumes that indisreplident cannot be set for
+ * invalid indexes.
*/
indexForm->indisvalid = false;
indexForm->indisclustered = false;
+ indexForm->indisreplident = false;
break;
case INDEX_DROP_SET_DEAD:
@@ -3364,6 +3376,8 @@ index_set_state_flags(Oid indexId, IndexStateFlagsAction action)
* the index at all.
*/
Assert(!indexForm->indisvalid);
+ Assert(!indexForm->indisclustered);
+ Assert(!indexForm->indisreplident);
indexForm->indisready = false;
indexForm->indislive = false;
break;
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 8625cbeab6e47..a2d61302f9e82 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -554,6 +554,12 @@ CREATE VIEW pg_shmem_allocations AS
REVOKE ALL ON pg_shmem_allocations FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
+CREATE VIEW pg_backend_memory_contexts AS
+ SELECT * FROM pg_get_backend_memory_contexts();
+
+REVOKE ALL ON pg_backend_memory_contexts FROM PUBLIC;
+REVOKE EXECUTE ON FUNCTION pg_get_backend_memory_contexts() FROM PUBLIC;
+
-- Statistics views
CREATE VIEW pg_stat_all_tables AS
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 71b7577afc067..774b26fd2c4d2 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -244,19 +244,22 @@ typedef struct QueueBackendStatus
/*
* Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
*
- * The AsyncQueueControl structure is protected by the NotifyQueueLock.
+ * The AsyncQueueControl structure is protected by the NotifyQueueLock and
+ * NotifyQueueTailLock.
*
- * When holding the lock in SHARED mode, backends may only inspect their own
- * entries as well as the head and tail pointers. Consequently we can allow a
- * backend to update its own record while holding only SHARED lock (since no
- * other backend will inspect it).
+ * When holding NotifyQueueLock in SHARED mode, backends may only inspect
+ * their own entries as well as the head and tail pointers. Consequently we
+ * can allow a backend to update its own record while holding only SHARED lock
+ * (since no other backend will inspect it).
*
- * When holding the lock in EXCLUSIVE mode, backends can inspect the entries
- * of other backends and also change the head and tail pointers.
+ * When holding NotifyQueueLock in EXCLUSIVE mode, backends can inspect the
+ * entries of other backends and also change the head pointer. When holding
+ * both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends
+ * can change the tail pointer.
*
* NotifySLRULock is used as the control lock for the pg_notify SLRU buffers.
- * In order to avoid deadlocks, whenever we need both locks, we always first
- * get NotifyQueueLock and then NotifySLRULock.
+ * In order to avoid deadlocks, whenever we need multiple locks, we first get
+ * NotifyQueueTailLock, then NotifyQueueLock, and lastly NotifySLRULock.
*
* Each backend uses the backend[] array entry with index equal to its
* BackendId (which can range from 1 to MaxBackends). We rely on this to make
@@ -299,13 +302,10 @@ static SlruCtlData NotifyCtlData;
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
- * slru.c currently assumes that all filenames are four characters of hex
- * digits. That means that we can use segments 0000 through FFFF.
- * Each segment contains SLRU_PAGES_PER_SEGMENT pages which gives us
- * the pages from 0 to SLRU_PAGES_PER_SEGMENT * 0x10000 - 1.
- *
- * It's of course possible to enhance slru.c, but this gives us so much
- * space already that it doesn't seem worth the trouble.
+ * Use segments 0000 through FFFF. Each contains SLRU_PAGES_PER_SEGMENT pages
+ * which gives us the pages from 0 to SLRU_PAGES_PER_SEGMENT * 0x10000 - 1.
+ * We could use as many segments as SlruScanDirectory() allows, but this gives
+ * us so much space already that it doesn't seem worth the trouble.
*
* The most data we can have in the queue at a time is QUEUE_MAX_PAGE/2
* pages, because more than that would confuse slru.c into thinking there
@@ -2177,6 +2177,10 @@ asyncQueueAdvanceTail(void)
int newtailpage;
int boundary;
+ /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+ LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
+
+ /* Compute the new tail. */
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
min = QUEUE_HEAD;
for (BackendId i = QUEUE_FIRST_LISTENER; i > 0; i = QUEUE_NEXT_LISTENER(i))
@@ -2185,7 +2189,6 @@ asyncQueueAdvanceTail(void)
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
}
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
- QUEUE_TAIL = min;
LWLockRelease(NotifyQueueLock);
/*
@@ -2205,6 +2208,17 @@ asyncQueueAdvanceTail(void)
*/
SimpleLruTruncate(NotifyCtl, newtailpage);
}
+
+ /*
+ * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
+ * the segment immediately prior to the new tail, allowing fresh data into
+ * that segment.
+ */
+ LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
+ QUEUE_TAIL = min;
+ LWLockRelease(NotifyQueueLock);
+
+ LWLockRelease(NotifyQueueTailLock);
}
/*
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 30e0a7ee7f219..1ce37dc4e2815 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -116,7 +116,8 @@ static void show_instrumentation_count(const char *qlabel, int which,
static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
static void show_eval_params(Bitmapset *bms_params, ExplainState *es);
static const char *explain_get_index_name(Oid indexId);
-static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage,
+ bool planning);
static void show_wal_usage(ExplainState *es, const WalUsage *usage);
static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir,
ExplainState *es);
@@ -184,6 +185,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
es->wal = defGetBoolean(opt);
else if (strcmp(opt->defname, "settings") == 0)
es->settings = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "usage") == 0)
+ es->usage = defGetBoolean(opt);
else if (strcmp(opt->defname, "timing") == 0)
{
timing_set = true;
@@ -221,11 +224,6 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
parser_errposition(pstate, opt->location)));
}
- if (es->buffers && !es->analyze)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("EXPLAIN option BUFFERS requires ANALYZE")));
-
if (es->wal && !es->analyze)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -312,6 +310,7 @@ NewExplainState(void)
/* Set default options (most fields can be left as zeroes). */
es->costs = true;
+ es->usage = true;
/* Prepare output buffer. */
es->str = makeStringInfo();
@@ -586,8 +585,13 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
/* Create textual dump of plan tree */
ExplainPrintPlan(es, queryDesc);
- if (es->summary && (planduration || bufusage))
+ /* Show buffer usage in planning */
+ if (bufusage)
+ {
ExplainOpenGroup("Planning", "Planning", true, es);
+ show_buffer_usage(es, bufusage, true);
+ ExplainCloseGroup("Planning", "Planning", true, es);
+ }
if (es->summary && planduration)
{
@@ -596,19 +600,6 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
ExplainPropertyFloat("Planning Time", "ms", 1000.0 * plantime, 3, es);
}
- /* Show buffer usage */
- if (es->summary && bufusage)
- {
- if (es->format == EXPLAIN_FORMAT_TEXT)
- es->indent++;
- show_buffer_usage(es, bufusage);
- if (es->format == EXPLAIN_FORMAT_TEXT)
- es->indent--;
- }
-
- if (es->summary && (planduration || bufusage))
- ExplainCloseGroup("Planning", "Planning", true, es);
-
/* Print info about runtime of triggers */
if (es->analyze)
ExplainPrintTriggers(es, queryDesc);
@@ -1996,7 +1987,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->bufusage);
+ show_buffer_usage(es, &planstate->instrument->bufusage, false);
if (es->wal && planstate->instrument)
show_wal_usage(es, &planstate->instrument->walusage);
@@ -2015,7 +2006,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->bufusage);
+ show_buffer_usage(es, &instrument->bufusage, false);
if (es->wal)
show_wal_usage(es, &instrument->walusage);
ExplainCloseWorker(n, es);
@@ -3023,22 +3014,50 @@ show_hash_info(HashState *hashstate, ExplainState *es)
else if (hinstrument.nbatch_original != hinstrument.nbatch ||
hinstrument.nbuckets_original != hinstrument.nbuckets)
{
+ ListCell *lc;
+
ExplainIndentText(es);
appendStringInfo(es->str,
- "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ "Buckets: %d (originally %d) Batches: %d (originally %d)",
hinstrument.nbuckets,
hinstrument.nbuckets_original,
hinstrument.nbatch,
- hinstrument.nbatch_original,
- spacePeakKb);
+ hinstrument.nbatch_original);
+ if (es->usage)
+ appendStringInfo(es->str, " Memory Usage: %ldkB\n", spacePeakKb);
+ else
+ appendStringInfo(es->str, "\n");
+
+ foreach(lc, hinstrument.fallback_batches_stats)
+ {
+ FallbackBatchStats *fbs = lfirst(lc);
+
+ ExplainIndentText(es);
+ appendStringInfo(es->str, "Batch: %d Stripes: %d\n", fbs->batchno, fbs->numstripes);
+ }
}
else
{
+ ListCell *lc;
+
ExplainIndentText(es);
appendStringInfo(es->str,
- "Buckets: %d Batches: %d Memory Usage: %ldkB\n",
- hinstrument.nbuckets, hinstrument.nbatch,
- spacePeakKb);
+ "Buckets: %d Batches: %d",
+ hinstrument.nbuckets, hinstrument.nbatch);
+ if (es->usage)
+ appendStringInfo(es->str, " Memory Usage: %ldkB\n", spacePeakKb);
+ else
+ appendStringInfo(es->str, "\n");
+ foreach(lc, hinstrument.fallback_batches_stats)
+ {
+ FallbackBatchStats *fbs = lfirst(lc);
+
+ ExplainIndentText(es);
+ appendStringInfo(es->str,
+ "Batch: %d Stripes: %d\n",
+ fbs->batchno,
+ fbs->numstripes);
+ }
}
}
}
@@ -3301,7 +3320,7 @@ explain_get_index_name(Oid indexId)
* Show buffer usage details.
*/
static void
-show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
{
if (es->format == EXPLAIN_FORMAT_TEXT)
{
@@ -3317,6 +3336,15 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
usage->temp_blks_written > 0);
bool has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
!INSTR_TIME_IS_ZERO(usage->blk_write_time));
+ bool show_planning = (planning && (has_shared ||
+ has_local || has_temp || has_timing));
+
+ if (show_planning)
+ {
+ ExplainIndentText(es);
+ appendStringInfoString(es->str, "Planning:\n");
+ es->indent++;
+ }
/* Show only positive counter values. */
if (has_shared || has_local || has_temp)
@@ -3386,6 +3414,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
appendStringInfoChar(es->str, '\n');
}
+
+ if (show_planning)
+ es->indent--;
}
else
{
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index cd989c95e5174..d2b15a3387b0b 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -405,6 +405,8 @@ static bool ConstraintImpliedByRelConstraint(Relation scanrel,
List *testConstraint, List *provenConstraint);
static ObjectAddress ATExecColumnDefault(Relation rel, const char *colName,
Node *newDefault, LOCKMODE lockmode);
+static ObjectAddress ATExecCookedColumnDefault(Relation rel, AttrNumber attnum,
+ Node *newDefault);
static ObjectAddress ATExecAddIdentity(Relation rel, const char *colName,
Node *def, LOCKMODE lockmode);
static ObjectAddress ATExecSetIdentity(Relation rel, const char *colName,
@@ -2054,8 +2056,8 @@ storage_name(char c)
* 'schema' is the column/attribute definition for the table. (It's a list
* of ColumnDef's.) It is destructively changed.
* 'supers' is a list of OIDs of parent relations, already locked by caller.
- * 'relpersistence' is a persistence type of the table.
- * 'is_partition' tells if the table is a partition
+ * 'relpersistence' is the persistence type of the table.
+ * 'is_partition' tells if the table is a partition.
*
* Output arguments:
* 'supconstr' receives a list of constraints belonging to the parents,
@@ -2218,7 +2220,11 @@ MergeAttributes(List *schema, List *supers, char relpersistence,
TupleDesc tupleDesc;
TupleConstr *constr;
AttrMap *newattmap;
+ List *inherited_defaults;
+ List *cols_with_defaults;
AttrNumber parent_attno;
+ ListCell *lc1;
+ ListCell *lc2;
/* caller already got lock */
relation = table_open(parent, NoLock);
@@ -2304,6 +2310,9 @@ MergeAttributes(List *schema, List *supers, char relpersistence,
*/
newattmap = make_attrmap(tupleDesc->natts);
+ /* We can't process inherited defaults until newattmap is complete. */
+ inherited_defaults = cols_with_defaults = NIL;
+
for (parent_attno = 1; parent_attno <= tupleDesc->natts;
parent_attno++)
{
@@ -2359,7 +2368,7 @@ MergeAttributes(List *schema, List *supers, char relpersistence,
get_collation_name(defCollId),
get_collation_name(attribute->attcollation))));
- /* Copy storage parameter */
+ /* Copy/check storage parameter */
if (def->storage == 0)
def->storage = attribute->attstorage;
else if (def->storage != attribute->attstorage)
@@ -2410,7 +2419,7 @@ MergeAttributes(List *schema, List *supers, char relpersistence,
}
/*
- * Copy default if any
+ * Locate default if any
*/
if (attribute->atthasdef)
{
@@ -2432,23 +2441,59 @@ MergeAttributes(List *schema, List *supers, char relpersistence,
Assert(this_default != NULL);
/*
- * If default expr could contain any vars, we'd need to fix
- * 'em, but it can't; so default is ready to apply to child.
- *
- * If we already had a default from some prior parent, check
- * to see if they are the same. If so, no problem; if not,
- * mark the column as having a bogus default. Below, we will
- * complain if the bogus default isn't overridden by the child
- * schema.
+ * If it's a GENERATED default, it might contain Vars that
+ * need to be mapped to the inherited column(s)' new numbers.
+ * We can't do that till newattmap is ready, so just remember
+ * all the inherited default expressions for the moment.
*/
- Assert(def->raw_default == NULL);
- if (def->cooked_default == NULL)
- def->cooked_default = this_default;
- else if (!equal(def->cooked_default, this_default))
- {
- def->cooked_default = &bogus_marker;
- have_bogus_defaults = true;
- }
+ inherited_defaults = lappend(inherited_defaults, this_default);
+ cols_with_defaults = lappend(cols_with_defaults, def);
+ }
+ }
+
+ /*
+ * Now process any inherited default expressions, adjusting attnos
+ * using the completed newattmap map.
+ */
+ forboth(lc1, inherited_defaults, lc2, cols_with_defaults)
+ {
+ Node *this_default = (Node *) lfirst(lc1);
+ ColumnDef *def = (ColumnDef *) lfirst(lc2);
+ bool found_whole_row;
+
+ /* Adjust Vars to match new table's column numbering */
+ this_default = map_variable_attnos(this_default,
+ 1, 0,
+ newattmap,
+ InvalidOid, &found_whole_row);
+
+ /*
+ * For the moment we have to reject whole-row variables. We could
+ * convert them, if we knew the new table's rowtype OID, but that
+ * hasn't been assigned yet. (A variable could only appear in a
+ * generation expression, so the error message is correct.)
+ */
+ if (found_whole_row)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot convert whole-row table reference"),
+ errdetail("Generation expression for column \"%s\" contains a whole-row reference to table \"%s\".",
+ def->colname,
+ RelationGetRelationName(relation))));
+
+ /*
+ * If we already had a default from some prior parent, check to
+ * see if they are the same. If so, no problem; if not, mark the
+ * column as having a bogus default. Below, we will complain if
+ * the bogus default isn't overridden by the child schema.
+ */
+ Assert(def->raw_default == NULL);
+ if (def->cooked_default == NULL)
+ def->cooked_default = this_default;
+ else if (!equal(def->cooked_default, this_default))
+ {
+ def->cooked_default = &bogus_marker;
+ have_bogus_defaults = true;
}
}
@@ -2667,7 +2712,6 @@ MergeAttributes(List *schema, List *supers, char relpersistence,
def->raw_default = newdef->raw_default;
def->cooked_default = newdef->cooked_default;
}
-
}
else
{
@@ -3781,6 +3825,7 @@ AlterTableGetLockLevel(List *cmds)
* Theoretically, these could be ShareRowExclusiveLock.
*/
case AT_ColumnDefault:
+ case AT_CookedColumnDefault:
case AT_AlterConstraint:
case AT_AddIndex: /* from ADD CONSTRAINT */
case AT_AddIndexConstraint:
@@ -4040,6 +4085,13 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd,
/* No command-specific prep needed */
pass = cmd->def ? AT_PASS_ADD_OTHERCONSTR : AT_PASS_DROP;
break;
+ case AT_CookedColumnDefault: /* add a pre-cooked default */
+ /* This is currently used only in CREATE TABLE */
+ /* (so the permission check really isn't necessary) */
+ ATSimplePermissions(rel, ATT_TABLE | ATT_FOREIGN_TABLE);
+ /* This command never recurses */
+ pass = AT_PASS_ADD_OTHERCONSTR;
+ break;
case AT_AddIdentity:
ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE);
/* This command never recurses */
@@ -4398,6 +4450,9 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
case AT_ColumnDefault: /* ALTER COLUMN DEFAULT */
address = ATExecColumnDefault(rel, cmd->name, cmd->def, lockmode);
break;
+ case AT_CookedColumnDefault: /* add a pre-cooked default */
+ address = ATExecCookedColumnDefault(rel, cmd->num, cmd->def);
+ break;
case AT_AddIdentity:
cmd = ATParseTransformCmd(wqueue, tab, rel, cmd, false, lockmode,
cur_pass, context);
@@ -4458,9 +4513,12 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
lockmode);
break;
case AT_AddConstraint: /* ADD CONSTRAINT */
- cmd = ATParseTransformCmd(wqueue, tab, rel, cmd, false, lockmode,
- cur_pass, context);
- /* Might not have gotten AddConstraint back from parse transform */
+ /* Transform the command only during initial examination */
+ if (cur_pass == AT_PASS_ADD_CONSTR)
+ cmd = ATParseTransformCmd(wqueue, tab, rel, cmd,
+ false, lockmode,
+ cur_pass, context);
+ /* Depending on constraint type, might be no more work to do now */
if (cmd != NULL)
address =
ATExecAddConstraint(wqueue, tab, rel,
@@ -4468,9 +4526,12 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
false, false, lockmode);
break;
case AT_AddConstraintRecurse: /* ADD CONSTRAINT with recursion */
- cmd = ATParseTransformCmd(wqueue, tab, rel, cmd, true, lockmode,
- cur_pass, context);
- /* Might not have gotten AddConstraint back from parse transform */
+ /* Transform the command only during initial examination */
+ if (cur_pass == AT_PASS_ADD_CONSTR)
+ cmd = ATParseTransformCmd(wqueue, tab, rel, cmd,
+ true, lockmode,
+ cur_pass, context);
+ /* Depending on constraint type, might be no more work to do now */
if (cmd != NULL)
address =
ATExecAddConstraint(wqueue, tab, rel,
@@ -4732,75 +4793,88 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
foreach(lc, atstmt->cmds)
{
AlterTableCmd *cmd2 = lfirst_node(AlterTableCmd, lc);
+ int pass;
+
+ /*
+ * This switch need only cover the subcommand types that can be added
+ * by parse_utilcmd.c; otherwise, we'll use the default strategy of
+ * executing the subcommand immediately, as a substitute for the
+ * original subcommand. (Note, however, that this does cause
+ * AT_AddConstraint subcommands to be rescheduled into later passes,
+ * which is important for index and foreign key constraints.)
+ *
+ * We assume we needn't do any phase-1 checks for added subcommands.
+ */
+ switch (cmd2->subtype)
+ {
+ case AT_SetNotNull:
+ /* Need command-specific recursion decision */
+ ATPrepSetNotNull(wqueue, rel, cmd2,
+ recurse, false,
+ lockmode, context);
+ pass = AT_PASS_COL_ATTRS;
+ break;
+ case AT_AddIndex:
+ /* This command never recurses */
+ /* No command-specific prep needed */
+ pass = AT_PASS_ADD_INDEX;
+ break;
+ case AT_AddIndexConstraint:
+ /* This command never recurses */
+ /* No command-specific prep needed */
+ pass = AT_PASS_ADD_INDEXCONSTR;
+ break;
+ case AT_AddConstraint:
+ /* Recursion occurs during execution phase */
+ if (recurse)
+ cmd2->subtype = AT_AddConstraintRecurse;
+ switch (castNode(Constraint, cmd2->def)->contype)
+ {
+ case CONSTR_PRIMARY:
+ case CONSTR_UNIQUE:
+ case CONSTR_EXCLUSION:
+ pass = AT_PASS_ADD_INDEXCONSTR;
+ break;
+ default:
+ pass = AT_PASS_ADD_OTHERCONSTR;
+ break;
+ }
+ break;
+ case AT_AlterColumnGenericOptions:
+ /* This command never recurses */
+ /* No command-specific prep needed */
+ pass = AT_PASS_MISC;
+ break;
+ default:
+ pass = cur_pass;
+ break;
+ }
- if (newcmd == NULL &&
- (cmd->subtype == cmd2->subtype ||
- (cmd->subtype == AT_AddConstraintRecurse &&
- cmd2->subtype == AT_AddConstraint)))
+ if (pass < cur_pass)
+ {
+ /* Cannot schedule into a pass we already finished */
+ elog(ERROR, "ALTER TABLE scheduling failure: too late for pass %d",
+ pass);
+ }
+ else if (pass > cur_pass)
{
- /* Found the transformed version of our subcommand */
- cmd2->subtype = cmd->subtype; /* copy recursion flag */
- newcmd = cmd2;
+ /* OK, queue it up for later */
+ tab->subcmds[pass] = lappend(tab->subcmds[pass], cmd2);
}
else
{
- int pass;
-
/*
- * Schedule added subcommand appropriately. We assume we needn't
- * do any phase-1 checks for it. This switch only has to cover
- * the subcommand types that can be added by parse_utilcmd.c.
+ * We should see at most one subcommand for the current pass,
+ * which is the transformed version of the original subcommand.
*/
- switch (cmd2->subtype)
+ if (newcmd == NULL && cmd->subtype == cmd2->subtype)
{
- case AT_SetNotNull:
- /* Need command-specific recursion decision */
- ATPrepSetNotNull(wqueue, rel, cmd2,
- recurse, false,
- lockmode, context);
- pass = AT_PASS_COL_ATTRS;
- break;
- case AT_AddIndex:
- /* This command never recurses */
- /* No command-specific prep needed */
- pass = AT_PASS_ADD_INDEX;
- break;
- case AT_AddIndexConstraint:
- /* This command never recurses */
- /* No command-specific prep needed */
- pass = AT_PASS_ADD_INDEXCONSTR;
- break;
- case AT_AddConstraint:
- /* Recursion occurs during execution phase */
- if (recurse)
- cmd2->subtype = AT_AddConstraintRecurse;
- switch (castNode(Constraint, cmd2->def)->contype)
- {
- case CONSTR_PRIMARY:
- case CONSTR_UNIQUE:
- case CONSTR_EXCLUSION:
- pass = AT_PASS_ADD_INDEXCONSTR;
- break;
- default:
- pass = AT_PASS_ADD_OTHERCONSTR;
- break;
- }
- break;
- case AT_AlterColumnGenericOptions:
- /* This command never recurses */
- /* No command-specific prep needed */
- pass = AT_PASS_MISC;
- break;
- default:
- elog(ERROR, "unexpected AlterTableType: %d",
- (int) cmd2->subtype);
- pass = AT_PASS_UNSET;
- break;
+ /* Found the transformed version of our subcommand */
+ newcmd = cmd2;
}
- /* Must be for a later pass than we're currently doing */
- if (pass <= cur_pass)
- elog(ERROR, "ALTER TABLE scheduling failure");
- tab->subcmds[pass] = lappend(tab->subcmds[pass], cmd2);
+ else
+ elog(ERROR, "ALTER TABLE scheduling failure: bogus item for pass %d",
+ pass);
}
}
@@ -6859,6 +6933,35 @@ ATExecColumnDefault(Relation rel, const char *colName,
return address;
}
+/*
+ * Add a pre-cooked default expression.
+ *
+ * Return the address of the affected column.
+ */
+static ObjectAddress
+ATExecCookedColumnDefault(Relation rel, AttrNumber attnum,
+ Node *newDefault)
+{
+ ObjectAddress address;
+
+ /* We assume no checking is required */
+
+ /*
+ * Remove any old default for the column. We use RESTRICT here for
+ * safety, but at present we do not expect anything to depend on the
+ * default. (In ordinary cases, there could not be a default in place
+ * anyway, but it's possible when combining LIKE with inheritance.)
+ */
+ RemoveAttrDefault(RelationGetRelid(rel), attnum, DROP_RESTRICT, false,
+ true);
+
+ (void) StoreAttrDefault(rel, attnum, newDefault, true, false);
+
+ ObjectAddressSubSet(address, RelationRelationId,
+ RelationGetRelid(rel), attnum);
+ return address;
+}
+
/*
* ALTER TABLE ALTER COLUMN ADD IDENTITY
*
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index aba13c31d1bc2..308a51d95d7ad 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -949,11 +949,11 @@ vacuum_set_xid_limits(Relation rel,
/*
* We can always ignore processes running lazy vacuum. This is because we
* use these values only for deciding which tuples we must keep in the
- * tables. Since lazy vacuum doesn't write its XID anywhere, it's safe to
- * ignore it. In theory it could be problematic to ignore lazy vacuums in
- * a full vacuum, but keep in mind that only one vacuum process can be
- * working on a particular table at any time, and that each vacuum is
- * always an independent transaction.
+ * tables. Since lazy vacuum doesn't write its XID anywhere (usually no
+ * XID assigned), it's safe to ignore it. In theory it could be
+ * problematic to ignore lazy vacuums in a full vacuum, but keep in mind
+ * that only one vacuum process can be working on a particular table at
+ * any time, and that each vacuum is always an independent transaction.
*/
*oldestXmin = GetOldestNonRemovableTransactionId(rel);
@@ -1128,8 +1128,8 @@ vacuum_set_xid_limits(Relation rel,
* live tuples seen; but if we did not, we should not blindly extrapolate
* from that number, since VACUUM may have scanned a quite nonrandom
* subset of the table. When we have only partial information, we take
- * the old value of pg_class.reltuples as a measurement of the
- * tuple density in the unscanned pages.
+ * the old value of pg_class.reltuples/pg_class.relpages as a measurement
+ * of the tuple density in the unscanned pages.
*
* Note: scanned_tuples should count only *live* tuples, since
* pg_class.reltuples is defined that way.
@@ -1152,18 +1152,16 @@ vac_estimate_reltuples(Relation relation,
/*
* If scanned_pages is zero but total_pages isn't, keep the existing value
- * of reltuples. (Note: callers should avoid updating the pg_class
- * statistics in this situation, since no new information has been
- * provided.)
+ * of reltuples. (Note: we might be returning -1 in this case.)
*/
if (scanned_pages == 0)
return old_rel_tuples;
/*
- * If old value of relpages is zero, old density is indeterminate; we
- * can't do much except scale up scanned_tuples to match total_pages.
+ * If old density is unknown, we can't do much except scale up
+ * scanned_tuples to match total_pages.
*/
- if (old_rel_pages == 0)
+ if (old_rel_tuples < 0 || old_rel_pages == 0)
return floor((scanned_tuples / scanned_pages) * total_pages + 0.5);
/*
@@ -1361,6 +1359,14 @@ vac_update_datfrozenxid(void)
bool bogus = false;
bool dirty = false;
+ /*
+ * Restrict this task to one backend per database. This avoids race
+ * conditions that would move datfrozenxid or datminmxid backward. It
+ * avoids calling vac_truncate_clog() with a datfrozenxid preceding a
+ * datfrozenxid passed to an earlier vac_truncate_clog() call.
+ */
+ LockDatabaseFrozenIds(ExclusiveLock);
+
/*
* Initialize the "min" calculation with
* GetOldestNonRemovableTransactionId(), which is a reasonable
@@ -1551,6 +1557,9 @@ vac_truncate_clog(TransactionId frozenXID,
bool bogus = false;
bool frozenAlreadyWrapped = false;
+ /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+ LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
+
/* init oldest datoids to sync with my frozenXID/minMulti values */
oldestxid_datoid = MyDatabaseId;
minmulti_datoid = MyDatabaseId;
@@ -1660,6 +1669,8 @@ vac_truncate_clog(TransactionId frozenXID,
*/
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+
+ LWLockRelease(WrapLimitsVacuumLock);
}
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index ea69eeb2a1e4b..6fb2acc4e1391 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -58,8 +58,9 @@ static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
static void *dense_alloc(HashJoinTable hashtable, Size size);
static HashJoinTuple ExecParallelHashTupleAlloc(HashJoinTable hashtable,
- size_t size,
- dsa_pointer *shared);
+ size_t size,
+ dsa_pointer *shared);
+static void ExecParallelHashTableEvictBatch0(HashJoinTable hashtable);
static void MultiExecPrivateHash(HashState *node);
static void MultiExecParallelHash(HashState *node);
static inline HashJoinTuple ExecParallelHashFirstTuple(HashJoinTable table,
@@ -72,6 +73,9 @@ static inline void ExecParallelHashPushTuple(dsa_pointer_atomic *head,
static void ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch);
static void ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable);
static void ExecParallelHashRepartitionFirst(HashJoinTable hashtable);
+static void ExecParallelHashRepartitionBatch0Tuple(HashJoinTable hashtable,
+ MinimalTuple tuple,
+ uint32 hashvalue);
static void ExecParallelHashRepartitionRest(HashJoinTable hashtable);
static HashMemoryChunk ExecParallelHashPopChunkQueue(HashJoinTable table,
dsa_pointer *shared);
@@ -184,13 +188,53 @@ MultiExecPrivateHash(HashState *node)
}
else
{
- /* Not subject to skew optimization, so insert normally */
- ExecHashTableInsert(hashtable, slot, hashvalue);
+ /*
+ * Not subject to skew optimization, so either insert normally
+ * or save to batch file if batch 0 falls back and we have
+ * already filled the hashtable up to space_allowed.
+ */
+ int bucketno;
+ int batchno;
+ bool shouldFree;
+ MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+ ExecHashGetBucketAndBatch(hashtable, hashvalue,
+ &bucketno, &batchno);
+
+ /*
+ * If we set batch 0 to fallback on the previous tuple Save
+ * the tuples in this batch which will not fit in the
+ * hashtable should I be checking that hashtable->curstripe !=
+ * 0?
+ */
+ if (hashtable->hashloopBatchFile && hashtable->hashloopBatchFile[0])
+ ExecHashJoinSaveTuple(tuple,
+ hashvalue,
+ &hashtable->innerBatchFile[batchno]);
+ else
+ ExecHashTableInsert(hashtable, slot, hashvalue);
+
+ if (shouldFree)
+ heap_free_minimal_tuple(tuple);
}
hashtable->totalTuples += 1;
}
}
+ /*
+ * If batch 0 fell back, rewind the inner side file where we saved the
+ * tuples which did not fit in memory to prepare it for loading upon
+ * finishing probing stripe 0 of batch 0
+ */
+ if (hashtable->innerBatchFile && hashtable->innerBatchFile[0])
+ {
+ if (BufFileSeek(hashtable->innerBatchFile[0], 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file: %m")));
+ }
+
+
/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
if (hashtable->nbuckets != hashtable->nbuckets_optimal)
ExecHashIncreaseNumBuckets(hashtable);
@@ -319,9 +363,9 @@ MultiExecParallelHash(HashState *node)
* are now fixed. While building them we made sure they'd fit
* in our memory budget when we load them back in later (or we
* tried to do that and gave up because we detected extreme
- * skew).
+ * skew and thus marked them to fall back).
*/
- pstate->growth = PHJ_GROWTH_DISABLED;
+ pstate->growth = PHJ_GROWTH_LOADING;
}
}
@@ -496,12 +540,14 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
hashtable->curbatch = 0;
hashtable->nbatch_original = nbatch;
hashtable->nbatch_outstart = nbatch;
- hashtable->growEnabled = true;
hashtable->totalTuples = 0;
hashtable->partialTuples = 0;
hashtable->skewTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
+ hashtable->hashloopBatchFile = NULL;
+ hashtable->fallback_batches_stats = NULL;
+ hashtable->curstripe = STRIPE_DETACHED;
hashtable->spaceUsed = 0;
hashtable->spacePeak = 0;
hashtable->spaceAllowed = space_allowed;
@@ -573,6 +619,8 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
palloc0(nbatch * sizeof(BufFile *));
hashtable->outerBatchFile = (BufFile **)
palloc0(nbatch * sizeof(BufFile *));
+ hashtable->hashloopBatchFile = (BufFile **)
+ palloc0(nbatch * sizeof(BufFile *));
/* The files will not be opened until needed... */
/* ... but make sure we have temp tablespaces established for them */
PrepareTempTablespaces();
@@ -856,18 +904,19 @@ ExecHashTableDestroy(HashJoinTable hashtable)
int i;
/*
- * Make sure all the temp files are closed. We skip batch 0, since it
- * can't have any temp files (and the arrays might not even exist if
- * nbatch is only 1). Parallel hash joins don't use these files.
+ * Make sure all the temp files are closed. Parallel hash joins don't use
+ * these files.
*/
if (hashtable->innerBatchFile != NULL)
{
- for (i = 1; i < hashtable->nbatch; i++)
+ for (i = 0; i < hashtable->nbatch; i++)
{
if (hashtable->innerBatchFile[i])
BufFileClose(hashtable->innerBatchFile[i]);
if (hashtable->outerBatchFile[i])
BufFileClose(hashtable->outerBatchFile[i]);
+ if (hashtable->hashloopBatchFile[i])
+ BufFileClose(hashtable->hashloopBatchFile[i]);
}
}
@@ -878,6 +927,18 @@ ExecHashTableDestroy(HashJoinTable hashtable)
pfree(hashtable);
}
+/*
+ * Threshhold for tuple relocation during batch split for parallel and serial
+ * hashjoin.
+ * While growing the number of batches, for the batch which triggered the growth,
+ * if more than MAX_RELOCATION % of its tuples move to its child batch, then
+ * it likely has skewed data and so the child batch (the new home to the skewed
+ * tuples) will be marked as a "fallback" batch and processed using the hashloop
+ * join algorithm. The reverse is true as well: if more than MAX_RELOCATION
+ * remain in the parent, it too should be marked to "fallback".
+ */
+#define MAX_RELOCATION 0.8
+
/*
* ExecHashIncreaseNumBatches
* increase the original number of batches in order to reduce
@@ -888,14 +949,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
{
int oldnbatch = hashtable->nbatch;
int curbatch = hashtable->curbatch;
+ int childbatch;
int nbatch;
MemoryContext oldcxt;
long ninmemory;
long nfreed;
HashMemoryChunk oldchunks;
+ int curbatch_outgoing_tuples;
+ int childbatch_outgoing_tuples;
+ int target_batch;
+ FallbackBatchStats *fallback_batch_stats;
+ size_t batchSize = 0;
- /* do nothing if we've decided to shut off growth */
- if (!hashtable->growEnabled)
+ if (hashtable->hashloopBatchFile && hashtable->hashloopBatchFile[curbatch])
return;
/* safety check to avoid overflow */
@@ -919,6 +985,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
palloc0(nbatch * sizeof(BufFile *));
hashtable->outerBatchFile = (BufFile **)
palloc0(nbatch * sizeof(BufFile *));
+ hashtable->hashloopBatchFile = (BufFile **)
+ palloc0(nbatch * sizeof(BufFile *));
/* time to establish the temp tablespaces, too */
PrepareTempTablespaces();
}
@@ -929,10 +997,14 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *));
hashtable->outerBatchFile = (BufFile **)
repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *));
+ hashtable->hashloopBatchFile = (BufFile **)
+ repalloc(hashtable->hashloopBatchFile, nbatch * sizeof(BufFile *));
MemSet(hashtable->innerBatchFile + oldnbatch, 0,
(nbatch - oldnbatch) * sizeof(BufFile *));
MemSet(hashtable->outerBatchFile + oldnbatch, 0,
(nbatch - oldnbatch) * sizeof(BufFile *));
+ MemSet(hashtable->hashloopBatchFile + oldnbatch, 0,
+ (nbatch - oldnbatch) * sizeof(BufFile *));
}
MemoryContextSwitchTo(oldcxt);
@@ -944,6 +1016,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
* no longer of the current batch.
*/
ninmemory = nfreed = 0;
+ curbatch_outgoing_tuples = childbatch_outgoing_tuples = 0;
+ childbatch = (1U << (my_log2(hashtable->nbatch) - 1)) | hashtable->curbatch;
/* If know we need to resize nbuckets, we can do it while rebatching. */
if (hashtable->nbuckets_optimal != hashtable->nbuckets)
@@ -990,7 +1064,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
&bucketno, &batchno);
- if (batchno == curbatch)
+ if (batchno == curbatch && (curbatch != 0 || batchSize + hashTupleSize < hashtable->spaceAllowed))
{
/* keep tuple in memory - copy it into the new chunk */
HashJoinTuple copyTuple;
@@ -1001,17 +1075,29 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
/* and add it back to the appropriate bucket */
copyTuple->next.unshared = hashtable->buckets.unshared[bucketno];
hashtable->buckets.unshared[bucketno] = copyTuple;
+ curbatch_outgoing_tuples++;
+ batchSize += hashTupleSize;
}
else
{
/* dump it out */
- Assert(batchno > curbatch);
+ Assert(batchno > curbatch || batchSize + hashTupleSize >= hashtable->spaceAllowed);
ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
hashTuple->hashvalue,
&hashtable->innerBatchFile[batchno]);
hashtable->spaceUsed -= hashTupleSize;
nfreed++;
+
+ /*
+ * TODO: what to do about tuples that don't go to the child
+ * batch or stay in the current batch? (this is why we are
+ * counting tuples to child and curbatch with two diff
+ * variables in case the tuples go to a batch that isn't the
+ * child)
+ */
+ if (batchno == childbatch)
+ childbatch_outgoing_tuples++;
}
/* next tuple in this chunk */
@@ -1032,21 +1118,33 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
#endif
/*
- * If we dumped out either all or none of the tuples in the table, disable
- * further expansion of nbatch. This situation implies that we have
- * enough tuples of identical hashvalues to overflow spaceAllowed.
- * Increasing nbatch will not fix it since there's no way to subdivide the
- * group any more finely. We have to just gut it out and hope the server
- * has enough RAM.
+ * The same batch should not be marked to fall back more than once
*/
- if (nfreed == 0 || nfreed == ninmemory)
- {
- hashtable->growEnabled = false;
#ifdef HJDEBUG
- printf("Hashjoin %p: disabling further increase of nbatch\n",
- hashtable);
+ if ((childbatch_outgoing_tuples / (float) ninmemory) >= 0.8)
+ printf("childbatch %i targeted to fallback.", childbatch);
+ if ((curbatch_outgoing_tuples / (float) ninmemory) >= 0.8)
+ printf("curbatch %i targeted to fallback.", curbatch);
#endif
- }
+
+ /*
+ * If too many tuples remain in the parent or too many tuples migrate to
+ * the child, there is likely skew and continuing to increase the number
+ * of batches will not help. Mark the batch which contains the skewed
+ * tuples to be processed with block nested hashloop join.
+ */
+ if ((childbatch_outgoing_tuples / (float) ninmemory) >= MAX_RELOCATION)
+ target_batch = childbatch;
+ else if ((curbatch_outgoing_tuples / (float) ninmemory) >= MAX_RELOCATION)
+ target_batch = curbatch;
+ else
+ return;
+ hashtable->hashloopBatchFile[target_batch] = BufFileCreateTemp(false);
+
+ fallback_batch_stats = palloc0(sizeof(FallbackBatchStats));
+ fallback_batch_stats->batchno = target_batch;
+ fallback_batch_stats->numstripes = 0;
+ hashtable->fallback_batches_stats = lappend(hashtable->fallback_batches_stats, fallback_batch_stats);
}
/*
@@ -1199,6 +1297,11 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
ExecParallelHashTableSetCurrentBatch(hashtable, 0);
/* Then partition, flush counters. */
ExecParallelHashRepartitionFirst(hashtable);
+
+ /*
+ * TODO: add a debugging check that confirms that all the tuples
+ * from the old generation are present in the new generation
+ */
ExecParallelHashRepartitionRest(hashtable);
ExecParallelHashMergeCounters(hashtable);
/* Wait for the above to be finished. */
@@ -1217,7 +1320,6 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
WAIT_EVENT_HASH_GROW_BATCHES_DECIDE))
{
bool space_exhausted = false;
- bool extreme_skew_detected = false;
/* Make sure that we have the current dimensions and buckets. */
ExecParallelHashEnsureBatchAccessors(hashtable);
@@ -1228,27 +1330,83 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
{
ParallelHashJoinBatch *batch = hashtable->batches[i].shared;
+ /*
+ * All batches were just created anew during
+ * repartitioning
+ */
+ Assert(!hashtable->batches[i].shared->hashloop_fallback);
+
+ /*
+ * At the time of repartitioning, each batch updates its
+ * estimated_size to reflect the size of the batch file on
+ * disk. It is also updated when increasing preallocated
+ * space in ExecParallelHashTuplePrealloc().
+ *
+ * Batch 0 is inserted into memory during the build stage,
+ * it can spill to a file, so the size member, which
+ * reflects the part of batch 0 in memory should never
+ * exceed the space_allowed.
+ */
+ Assert(batch->size <= pstate->space_allowed);
+
if (batch->space_exhausted ||
batch->estimated_size > pstate->space_allowed)
{
int parent;
+ float frac_moved;
space_exhausted = true;
+ parent = i % pstate->old_nbatch;
+ frac_moved = batch->ntuples / (float) hashtable->batches[parent].shared->old_ntuples;
+
/*
- * Did this batch receive ALL of the tuples from its
- * parent batch? That would indicate that further
- * repartitioning isn't going to help (the hash values
- * are probably all the same).
+ * If too many tuples remain in the parent or too many
+ * tuples migrate to the child, there is likely skew
+ * and continuing to increase the number of batches
+ * will not help. Mark the batch which contains the
+ * skewed tuples to be processed with block nested
+ * hashloop join.
*/
- parent = i % pstate->old_nbatch;
- if (batch->ntuples == hashtable->batches[parent].shared->old_ntuples)
- extreme_skew_detected = true;
+ if (frac_moved >= MAX_RELOCATION)
+ {
+ batch->hashloop_fallback = true;
+ space_exhausted = false;
+ }
}
+
+ /*
+ * If all of the tuples in the hashtable were put back in
+ * the hashtable during repartitioning, mark this batch as
+ * a fallback batch so that we will evict the tuples to a
+ * spill file were we to run out of space again This has
+ * the problem of wasting a lot of time during the probe
+ * phase if it turns out that we never try and allocate
+ * any more memory in the hashtable.
+ *
+ * TODO: It might be worth doing something to indicate
+ * that if all of the tuples went back into a batch but it
+ * only exactly used the space_allowed, that the batch is
+ * not a fallback batch yet but that the current stripe is
+ * full, so if you need to allocate more, it would mark it
+ * as a fallback batch. Otherwise, a batch 0 with no
+ * tuples in spill files will still be treated as a
+ * fallback batch during probing
+ */
+ if (i == 0 && hashtable->batches[0].shared->size == pstate->space_allowed)
+ {
+ if (hashtable->batches[0].shared->ntuples == hashtable->batches[0].shared->old_ntuples)
+ {
+ hashtable->batches[0].shared->hashloop_fallback = true;
+ space_exhausted = false;
+ }
+ }
+ if (space_exhausted)
+ break;
}
- /* Don't keep growing if it's not helping or we'd overflow. */
- if (extreme_skew_detected || hashtable->nbatch >= INT_MAX / 2)
+ /* Don't keep growing if we'd overflow. */
+ if (hashtable->nbatch >= INT_MAX / 2)
pstate->growth = PHJ_GROWTH_DISABLED;
else if (space_exhausted)
pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
@@ -1276,65 +1434,153 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
static void
ExecParallelHashRepartitionFirst(HashJoinTable hashtable)
{
+ ParallelHashJoinState *pstate;
+
+ ParallelHashJoinBatch *old_shared;
+ SharedTuplestoreAccessor *old_inner_batch0_sts;
+
dsa_pointer chunk_shared;
HashMemoryChunk chunk;
- Assert(hashtable->nbatch == hashtable->parallel_state->nbatch);
+ ParallelHashJoinBatch *old_batches = (ParallelHashJoinBatch *) dsa_get_address(hashtable->area, hashtable->parallel_state->old_batches);
+
+ Assert(old_batches);
+ old_shared = NthParallelHashJoinBatch(old_batches, 0);
+ old_inner_batch0_sts = sts_attach(ParallelHashJoinBatchInner(old_shared), ParallelWorkerNumber + 1, &hashtable->parallel_state->fileset);
+
+ pstate = hashtable->parallel_state;
- while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared)))
+ Assert(hashtable->nbatch == hashtable->parallel_state->nbatch);
+ BarrierAttach(&pstate->repartition_barrier);
+ switch (PHJ_REPARTITION_BATCH0_PHASE(BarrierPhase(&pstate->repartition_barrier)))
{
- size_t idx = 0;
+ case PHJ_REPARTITION_BATCH0_DRAIN_QUEUE:
+ while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared)))
+ {
+ MinimalTuple tuple;
+ size_t idx = 0;
- /* Repartition all tuples in this chunk. */
- while (idx < chunk->used)
- {
- HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
- MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
- HashJoinTuple copyTuple;
- dsa_pointer shared;
- int bucketno;
- int batchno;
+ /*
+ * Repartition all tuples in this chunk. These tuples may be
+ * relocated to a batch file or may be inserted back into
+ * memory.
+ */
+ while (idx < chunk->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
- ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
- &bucketno, &batchno);
+ tuple = HJTUPLE_MINTUPLE(hashTuple);
- Assert(batchno < hashtable->nbatch);
- if (batchno == 0)
- {
- /* It still belongs in batch 0. Copy to a new chunk. */
- copyTuple =
- ExecParallelHashTupleAlloc(hashtable,
- HJTUPLE_OVERHEAD + tuple->t_len,
- &shared);
- copyTuple->hashvalue = hashTuple->hashvalue;
- memcpy(HJTUPLE_MINTUPLE(copyTuple), tuple, tuple->t_len);
- ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
- copyTuple, shared);
+ ExecParallelHashRepartitionBatch0Tuple(hashtable,
+ tuple,
+ hashTuple->hashvalue);
+
+ idx += MAXALIGN(HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+
+ dsa_free(hashtable->area, chunk_shared);
+ CHECK_FOR_INTERRUPTS();
}
- else
+ BarrierArriveAndWait(&pstate->repartition_barrier, WAIT_EVENT_HASH_REPARTITION_BATCH0_DRAIN_QUEUE);
+ /* FALLTHROUGH */
+ case PHJ_REPARTITION_BATCH0_DRAIN_SPILL_FILE:
{
- size_t tuple_size =
- MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
+ MinimalTuple tuple;
+ tupleMetadata metadata;
- /* It belongs in a later batch. */
- hashtable->batches[batchno].estimated_size += tuple_size;
- sts_puttuple(hashtable->batches[batchno].inner_tuples,
- &hashTuple->hashvalue, tuple);
+ /*
+ * Repartition all of the tuples in this spill file. These
+ * tuples may go back into the hashtable if space was freed up
+ * or they may go into another batch or they may go into the
+ * batch 0 spill file.
+ */
+ sts_begin_parallel_scan(old_inner_batch0_sts);
+ while ((tuple = sts_parallel_scan_next(old_inner_batch0_sts,
+ &metadata.hashvalue)))
+ {
+
+ ExecParallelHashRepartitionBatch0Tuple(hashtable,
+ tuple,
+ metadata.hashvalue);
+ }
+ sts_end_parallel_scan(old_inner_batch0_sts);
}
+ }
+ BarrierArriveAndDetach(&pstate->repartition_barrier);
+}
- /* Count this tuple. */
- ++hashtable->batches[0].old_ntuples;
- ++hashtable->batches[batchno].ntuples;
+static void
+ExecParallelHashRepartitionBatch0Tuple(HashJoinTable hashtable,
+ MinimalTuple tuple,
+ uint32 hashvalue)
+{
+ int batchno;
+ int bucketno;
+ dsa_pointer shared;
+ HashJoinTuple copyTuple;
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ bool spill = true;
+ bool hashtable_full = hashtable->batches[0].shared->size >= pstate->space_allowed;
+ size_t tuple_size =
+ MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
- idx += MAXALIGN(HJTUPLE_OVERHEAD +
- HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
+
+ /*
+ * We don't take a lock to read pstate->space_allowed because it should
+ * not change during execution of the hash join
+ */
+
+ Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER);
+ if (batchno == 0 && !hashtable_full)
+ {
+ copyTuple = ExecParallelHashTupleAlloc(hashtable,
+ HJTUPLE_OVERHEAD + tuple->t_len,
+ &shared);
+
+ /*
+ * TODO: do we need to check if growth was set to
+ * PHJ_GROWTH_SPILL_BATCH0?
+ */
+ if (copyTuple)
+ {
+ /* Store the hash value in the HashJoinTuple header. */
+ copyTuple->hashvalue = hashvalue;
+ memcpy(HJTUPLE_MINTUPLE(copyTuple), tuple, tuple->t_len);
+
+ /* Push it onto the front of the bucket's list */
+ ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
+ copyTuple, shared);
+ pg_atomic_add_fetch_u64(&hashtable->batches[0].shared->ntuples_in_memory, 1);
+
+ spill = false;
}
+ }
- /* Free this chunk. */
- dsa_free(hashtable->area, chunk_shared);
+ if (spill)
+ {
- CHECK_FOR_INTERRUPTS();
+ tupleMetadata metadata;
+
+ ParallelHashJoinBatchAccessor *batch_accessor = &(hashtable->batches[batchno]);
+
+ /*
+ * It is okay to use backend local here because force spill tuple is
+ * only done during repartitioning when we can't grow batches so won't
+ * make decision based on it and will merge counters during deciding
+ * and during evictbatch0 which can ony be done on a batch that is
+ * already fallback so we won't make decision on it and will merge
+ * counters after the build phase
+ */
+ batch_accessor->estimated_size += tuple_size;
+ metadata.hashvalue = hashvalue;
+
+ sts_puttuple(batch_accessor->inner_tuples,
+ &metadata,
+ tuple);
}
+ ++hashtable->batches[batchno].ntuples;
+ ++hashtable->batches[0].old_ntuples;
}
/*
@@ -1371,24 +1617,41 @@ ExecParallelHashRepartitionRest(HashJoinTable hashtable)
/* Scan one partition from the previous generation. */
sts_begin_parallel_scan(old_inner_tuples[i]);
- while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], &hashvalue)))
+ while ((tuple = sts_parallel_scan_next(old_inner_tuples[i],
+ &hashvalue)))
{
- size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
int bucketno;
int batchno;
+ size_t tuple_size;
+ tupleMetadata metadata;
+ ParallelHashJoinBatchAccessor *batch_accessor;
+
/* Decide which partition it goes to in the new generation. */
ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno,
&batchno);
- hashtable->batches[batchno].estimated_size += tuple_size;
- ++hashtable->batches[batchno].ntuples;
- ++hashtable->batches[i].old_ntuples;
+ tuple_size =
+ MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
- /* Store the tuple its new batch. */
- sts_puttuple(hashtable->batches[batchno].inner_tuples,
- &hashvalue, tuple);
+ batch_accessor = &(hashtable->batches[batchno]);
+ /*
+ * It is okay to use backend local here because force spill tuple
+ * is only done during repartitioning when we can't grow batches
+ * so won't make decision based on it and will merge counters
+ * during deciding and during evictbatch0 which can ony be done on
+ * a batch that is already fallback so we won't make decision on
+ * it and will merge counters after the build phase
+ */
+ batch_accessor->estimated_size += tuple_size;
+ metadata.hashvalue = hashvalue;
+
+ sts_puttuple(batch_accessor->inner_tuples,
+ &metadata,
+ tuple);
+ ++hashtable->batches[batchno].ntuples;
+ ++hashtable->batches[i].old_ntuples;
CHECK_FOR_INTERRUPTS();
}
sts_end_parallel_scan(old_inner_tuples[i]);
@@ -1705,7 +1968,7 @@ ExecParallelHashTableInsert(HashJoinTable hashtable,
hashTuple = ExecParallelHashTupleAlloc(hashtable,
HJTUPLE_OVERHEAD + tuple->t_len,
&shared);
- if (hashTuple == NULL)
+ if (!hashTuple)
goto retry;
/* Store the hash value in the HashJoinTuple header. */
@@ -1715,10 +1978,13 @@ ExecParallelHashTableInsert(HashJoinTable hashtable,
/* Push it onto the front of the bucket's list */
ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
hashTuple, shared);
+ pg_atomic_add_fetch_u64(&hashtable->batches[0].shared->ntuples_in_memory, 1);
+
}
else
{
size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
+ tupleMetadata metadata;
Assert(batchno > 0);
@@ -1731,7 +1997,11 @@ ExecParallelHashTableInsert(HashJoinTable hashtable,
Assert(hashtable->batches[batchno].preallocated >= tuple_size);
hashtable->batches[batchno].preallocated -= tuple_size;
- sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue,
+
+ metadata.hashvalue = hashvalue;
+
+ sts_puttuple(hashtable->batches[batchno].inner_tuples,
+ &metadata,
tuple);
}
++hashtable->batches[batchno].ntuples;
@@ -1746,10 +2016,11 @@ ExecParallelHashTableInsert(HashJoinTable hashtable,
* to other batches or to run out of memory, and should only be called with
* tuples that belong in the current batch once growth has been disabled.
*/
-void
+MinimalTuple
ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
TupleTableSlot *slot,
- uint32 hashvalue)
+ uint32 hashvalue,
+ int read_participant)
{
bool shouldFree;
MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
@@ -1758,19 +2029,26 @@ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
int batchno;
int bucketno;
+
ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
Assert(batchno == hashtable->curbatch);
+
hashTuple = ExecParallelHashTupleAlloc(hashtable,
HJTUPLE_OVERHEAD + tuple->t_len,
&shared);
+ if (!hashTuple)
+ return NULL;
+
hashTuple->hashvalue = hashvalue;
memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
hashTuple, shared);
+ pg_atomic_add_fetch_u64(&hashtable->batches[hashtable->curbatch].shared->ntuples_in_memory, 1);
if (shouldFree)
heap_free_minimal_tuple(tuple);
+ return tuple;
}
/*
@@ -2602,6 +2880,12 @@ ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt)
pcxt->nworkers * sizeof(HashInstrumentation);
node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size);
+ /*
+ * TODO: the linked list which is being used for fallback stats needs
+ * space allocated for it in shared memory as well. For now, it seems to
+ * be coincidentally working
+ */
+
/* Each per-worker area must start out as zeroes. */
memset(node->shared_info, 0, size);
@@ -2701,6 +2985,11 @@ ExecHashAccumInstrumentation(HashInstrumentation *instrument,
hashtable->nbatch_original);
instrument->space_peak = Max(instrument->space_peak,
hashtable->spacePeak);
+
+ /*
+ * TODO: this doesn't work right now in case of rescan (doesn't get max)
+ */
+ instrument->fallback_batches_stats = hashtable->fallback_batches_stats;
}
/*
@@ -2775,6 +3064,146 @@ dense_alloc(HashJoinTable hashtable, Size size)
return ptr;
}
+/*
+ * Assume caller has a lock or is behind a barrier and has the right
+ * to change these values
+ */
+inline void
+ExecParallelHashTableRecycle(HashJoinTable hashtable)
+{
+ ParallelHashJoinBatchAccessor *batch_accessor = &(hashtable->batches[hashtable->curbatch]);
+ ParallelHashJoinBatch *batch = batch_accessor->shared;
+
+ dsa_pointer_atomic *buckets = (dsa_pointer_atomic *)
+ dsa_get_address(hashtable->area, batch->buckets);
+
+ for (size_t i = 0; i < hashtable->nbuckets; ++i)
+ dsa_pointer_atomic_write(&buckets[i], InvalidDsaPointer);
+ batch->size = 0;
+ batch->space_exhausted = false;
+
+ /*
+ * TODO: I'm not sure that we want to reset this when this function is
+ * called to recycle the hashtable during the build stage as part of
+ * evicting batch 0. It seems like it would be okay since a worker does
+ * not have the right to over-allocate now. So, for a fallback batch,
+ * at_least_one_chunk doesn't matter It seems like it may not matter at
+ * all anymore...
+ */
+ batch_accessor->at_least_one_chunk = false;
+ pg_atomic_exchange_u64(&batch->ntuples_in_memory, 0);
+}
+
+/*
+ * The eviction phase machine is responsible for evicting tuples from the
+ * hashtable during the Build stage of executing a parallel-aware parallel
+ * hash join. After increasing the number of batches in
+ * ExecParallelHashIncreaseNumBatches(), in the PHJ_GROW_BATCHES_DECIDING
+ * phase, if the batch 0 hashtable meets the criteria for falling back
+ * and is marked a fallback batch, the next time an inserted tuple would
+ * exceed the space_allowed, instead, trigger an eviction. Evict all
+ * batch 0 tuples to spill files in batch 0 inner side SharedTuplestore.
+ */
+static void
+ExecParallelHashTableEvictBatch0(HashJoinTable hashtable)
+{
+
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ ParallelHashJoinBatchAccessor *batch0_accessor = &(hashtable->batches[0]);
+
+ /*
+ * No other workers must be inserting tuples into the hashtable once
+ * growth has been set to PHJ_EVICT. Otherwise, the below will not work
+ * correctly. This should be okay since the same assumptions are made in
+ * the increase batch machine.
+ */
+ BarrierAttach(&pstate->eviction_barrier);
+ switch (PHJ_EVICT_PHASE(BarrierPhase(&pstate->eviction_barrier)))
+ {
+ case PHJ_EVICT_ELECTING:
+ if (BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_ELECT))
+ {
+ pstate->chunk_work_queue = batch0_accessor->shared->chunks;
+ batch0_accessor->shared->chunks = InvalidDsaPointer;
+ ExecParallelHashTableRecycle(hashtable);
+ }
+ /* FALLTHROUGH */
+ case PHJ_EVICT_RESETTING:
+ BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_RESET);
+ /* FALLTHROUGH */
+ case PHJ_EVICT_SPILLING:
+ {
+ dsa_pointer chunk_shared;
+ HashMemoryChunk chunk;
+
+ /*
+ * TODO: Do I need to do this here? am I guaranteed to have
+ * the correct shared memory reference to the batches array
+ * already?
+ */
+ ParallelHashJoinBatch *batches;
+ ParallelHashJoinBatch *batch0;
+
+ batches = (ParallelHashJoinBatch *)
+ dsa_get_address(hashtable->area, pstate->batches);
+ batch0 = NthParallelHashJoinBatch(batches, 0);
+ Assert(batch0 == hashtable->batches[0].shared);
+
+ ExecParallelHashTableSetCurrentBatch(hashtable, 0);
+
+ while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared)))
+ {
+ size_t idx = 0;
+
+ while (idx < chunk->used)
+ {
+ tupleMetadata metadata;
+
+ size_t tuple_size;
+ MinimalTuple minTuple;
+ HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
+
+ minTuple = HJTUPLE_MINTUPLE(hashTuple);
+
+ tuple_size =
+ MAXALIGN(HJTUPLE_OVERHEAD + minTuple->t_len);
+
+ /*
+ * It is okay to use backend local here because can
+ * ony be done on a batch that is already fallback so
+ * we won't make decision on it and will merge
+ * counters after the build phase
+ */
+ batch0_accessor->estimated_size += tuple_size;
+ metadata.hashvalue = hashTuple->hashvalue;
+
+ sts_puttuple(batch0_accessor->inner_tuples,
+ &metadata,
+ minTuple);
+
+ idx += MAXALIGN(HJTUPLE_OVERHEAD +
+ HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+ dsa_free(hashtable->area, chunk_shared);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+ BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_SPILL);
+ }
+ /* FALLTHROUGH */
+ case PHJ_EVICT_FINISHING:
+
+ /*
+ * TODO: Is this phase needed?
+ */
+ if (BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_FINISH))
+ pstate->growth = PHJ_GROWTH_OK;
+ /* FALLTHROUGH */
+ case PHJ_EVICT_DONE:
+ BarrierArriveAndDetach(&pstate->eviction_barrier);
+ }
+}
+
/*
* Allocate space for a tuple in shared dense storage. This is equivalent to
* dense_alloc but for Parallel Hash using shared memory.
@@ -2787,7 +3216,8 @@ dense_alloc(HashJoinTable hashtable, Size size)
* possibility that the tuple no longer belongs in the same batch).
*/
static HashJoinTuple
-ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
+ExecParallelHashTupleAlloc(HashJoinTable hashtable,
+ size_t size,
dsa_pointer *shared)
{
ParallelHashJoinState *pstate = hashtable->parallel_state;
@@ -2828,7 +3258,8 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
* Check if we need to help increase the number of buckets or batches.
*/
if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES ||
- pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
+ pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS ||
+ pstate->growth == PHJ_GROWTH_SPILL_BATCH0)
{
ParallelHashGrowth growth = pstate->growth;
@@ -2840,6 +3271,8 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
ExecParallelHashIncreaseNumBatches(hashtable);
else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
ExecParallelHashIncreaseNumBuckets(hashtable);
+ else if (growth == PHJ_GROWTH_SPILL_BATCH0)
+ ExecParallelHashTableEvictBatch0(hashtable);
/* The caller must retry. */
return NULL;
@@ -2852,7 +3285,7 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
chunk_size = HASH_CHUNK_SIZE;
/* Check if it's time to grow batches or buckets. */
- if (pstate->growth != PHJ_GROWTH_DISABLED)
+ if (pstate->growth != PHJ_GROWTH_DISABLED && pstate->growth != PHJ_GROWTH_LOADING)
{
Assert(curbatch == 0);
Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER);
@@ -2861,16 +3294,26 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
* Check if our space limit would be exceeded. To avoid choking on
* very large tuples or very low hash_mem setting, we'll always allow
* each backend to allocate at least one chunk.
+ *
+ * If the batch has already been marked to fall back, then we don't
+ * need to worry about having allocated one chunk -- we should start
+ * evicting tuples.
*/
- if (hashtable->batches[0].at_least_one_chunk &&
- hashtable->batches[0].shared->size +
+ LWLockAcquire(&hashtable->batches[0].shared->lock, LW_EXCLUSIVE);
+ if (hashtable->batches[0].shared->size +
chunk_size > pstate->space_allowed)
{
- pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
- hashtable->batches[0].shared->space_exhausted = true;
- LWLockRelease(&pstate->lock);
-
- return NULL;
+ if (hashtable->batches[0].shared->hashloop_fallback || hashtable->batches[0].at_least_one_chunk)
+ {
+ if (hashtable->batches[0].shared->hashloop_fallback)
+ pstate->growth = PHJ_GROWTH_SPILL_BATCH0;
+ else if (hashtable->batches[0].at_least_one_chunk)
+ pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
+ hashtable->batches[0].shared->space_exhausted = true;
+ LWLockRelease(&pstate->lock);
+ LWLockRelease(&hashtable->batches[0].shared->lock);
+ return NULL;
+ }
}
/* Check if our load factor limit would be exceeded. */
@@ -2887,14 +3330,60 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
{
pstate->growth = PHJ_GROWTH_NEED_MORE_BUCKETS;
LWLockRelease(&pstate->lock);
+ LWLockRelease(&hashtable->batches[0].shared->lock);
return NULL;
}
}
+ LWLockRelease(&hashtable->batches[0].shared->lock);
}
+ /*
+ * TODO: should I care about hashtable->batches[b].at_least_one_chunk
+ * here?
+ */
+ if (pstate->growth == PHJ_GROWTH_LOADING)
+ {
+ int b = hashtable->curbatch;
+
+ LWLockAcquire(&hashtable->batches[b].shared->lock, LW_EXCLUSIVE);
+ if (hashtable->batches[b].shared->hashloop_fallback &&
+ (hashtable->batches[b].shared->space_exhausted ||
+ hashtable->batches[b].shared->size + chunk_size > pstate->space_allowed))
+ {
+ bool space_exhausted = hashtable->batches[b].shared->space_exhausted;
+
+ if (!space_exhausted)
+ hashtable->batches[b].shared->space_exhausted = true;
+ LWLockRelease(&pstate->lock);
+ LWLockRelease(&hashtable->batches[b].shared->lock);
+ return NULL;
+ }
+ LWLockRelease(&hashtable->batches[b].shared->lock);
+ }
+
+ /*
+ * If not even one chunk would fit in the space_allowed, there isn't
+ * anything we can do to avoid exceeding space_allowed. Also, if we keep
+ * the rule that a backend should be allowed to allocate at least one
+ * chunk, then we will end up tripping this assert some of the time unless
+ * we make that exception (should we make that exception?) TODO: should
+ * memory settings < chunk_size even be allowed. Should it error out?
+ * should we be able to make this assertion?
+ * Assert(hashtable->batches[hashtable->curbatch].shared->size +
+ * chunk_size <= pstate->space_allowed);
+ */
+
/* We are cleared to allocate a new chunk. */
chunk_shared = dsa_allocate(hashtable->area, chunk_size);
+
+ /*
+ * The chunk is accounted for in the hashtable size only. Even though
+ * batch 0 can spill, we don't need to track this allocated chunk in the
+ * estimated_stripe_size member because we check the size member when
+ * determining if the hashtable is too big, and, we will only ever number
+ * stripes (starting with 1 instead of 0 for batch 0) in the spill file.
+ */
hashtable->batches[curbatch].shared->size += chunk_size;
hashtable->batches[curbatch].at_least_one_chunk = true;
@@ -2964,21 +3453,40 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch)
{
ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i];
ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i);
+ SharedBits *sbits = ParallelHashJoinBatchOuterBits(shared, pstate->nparticipants);
char name[MAXPGPATH];
+ char sbname[MAXPGPATH];
+
+ shared->hashloop_fallback = false;
+ pg_atomic_init_flag(&shared->overflow_required);
+ pg_atomic_init_u64(&shared->ntuples_in_memory, 0);
+ /* TODO: is it okay to use the same tranche for this lock? */
+ LWLockInitialize(&shared->lock, LWTRANCHE_PARALLEL_HASH_JOIN);
+ shared->nstripes = 0;
/*
* All members of shared were zero-initialized. We just need to set
* up the Barrier.
*/
BarrierInit(&shared->batch_barrier, 0);
+ BarrierInit(&shared->stripe_barrier, 0);
+
+ /* Batch 0 doesn't need to be loaded. */
if (i == 0)
{
- /* Batch 0 doesn't need to be loaded. */
+ shared->nstripes = 1;
BarrierAttach(&shared->batch_barrier);
- while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_PROBING)
+ while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_STRIPING)
BarrierArriveAndWait(&shared->batch_barrier, 0);
BarrierDetach(&shared->batch_barrier);
+
+ BarrierAttach(&shared->stripe_barrier);
+ while (BarrierPhase(&shared->stripe_barrier) < PHJ_STRIPE_PROBING)
+ BarrierArriveAndWait(&shared->stripe_barrier, 0);
+ BarrierDetach(&shared->stripe_barrier);
}
+ /* why isn't done initialized here ? */
+ accessor->done = PHJ_BATCH_ACCESSOR_NOT_DONE;
/* Initialize accessor state. All members were zero-initialized. */
accessor->shared = shared;
@@ -2989,7 +3497,7 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch)
sts_initialize(ParallelHashJoinBatchInner(shared),
pstate->nparticipants,
ParallelWorkerNumber + 1,
- sizeof(uint32),
+ sizeof(tupleMetadata),
SHARED_TUPLESTORE_SINGLE_PASS,
&pstate->fileset,
name);
@@ -2999,10 +3507,14 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch)
pstate->nparticipants),
pstate->nparticipants,
ParallelWorkerNumber + 1,
- sizeof(uint32),
+ sizeof(tupleMetadata),
SHARED_TUPLESTORE_SINGLE_PASS,
&pstate->fileset,
name);
+ snprintf(sbname, MAXPGPATH, "%s.bitmaps", name);
+ /* Use the same SharedFileset for the SharedTupleStore and SharedBits */
+ accessor->sba = sb_initialize(sbits, pstate->nparticipants,
+ ParallelWorkerNumber + 1, &pstate->fileset, sbname);
}
MemoryContextSwitchTo(oldcxt);
@@ -3051,8 +3563,8 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
* It's possible for a backend to start up very late so that the whole
* join is finished and the shm state for tracking batches has already
* been freed by ExecHashTableDetach(). In that case we'll just leave
- * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
- * up early.
+ * hashtable->batches as NULL so that ExecParallelHashJoinAdvanceBatch()
+ * gives up early.
*/
if (!DsaPointerIsValid(pstate->batches))
return;
@@ -3074,10 +3586,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
{
ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i];
ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i);
+ SharedBits *sbits = ParallelHashJoinBatchOuterBits(shared, pstate->nparticipants);
accessor->shared = shared;
accessor->preallocated = 0;
- accessor->done = false;
+ accessor->done = PHJ_BATCH_ACCESSOR_NOT_DONE;
accessor->inner_tuples =
sts_attach(ParallelHashJoinBatchInner(shared),
ParallelWorkerNumber + 1,
@@ -3087,6 +3600,7 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
pstate->nparticipants),
ParallelWorkerNumber + 1,
&pstate->fileset);
+ accessor->sba = sb_attach(sbits, ParallelWorkerNumber + 1, &pstate->fileset);
}
MemoryContextSwitchTo(oldcxt);
@@ -3169,6 +3683,18 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
}
}
+bool
+ExecHashTableDetachStripe(HashJoinTable hashtable)
+{
+ int curbatch = hashtable->curbatch;
+ ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;
+ Barrier *stripe_barrier = &batch->stripe_barrier;
+
+ BarrierDetach(stripe_barrier);
+ hashtable->curstripe = STRIPE_DETACHED;
+ return false;
+}
+
/*
* Detach from all shared resources. If we are last to detach, clean up.
*/
@@ -3326,7 +3852,6 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
ParallelHashJoinBatchAccessor *batch = &hashtable->batches[batchno];
size_t want = Max(size, HASH_CHUNK_SIZE - HASH_CHUNK_HEADER_SIZE);
- Assert(batchno > 0);
Assert(batchno < hashtable->nbatch);
Assert(size == MAXALIGN(size));
@@ -3334,7 +3859,8 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
/* Has another participant commanded us to help grow? */
if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES ||
- pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
+ pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS ||
+ pstate->growth == PHJ_GROWTH_SPILL_BATCH0)
{
ParallelHashGrowth growth = pstate->growth;
@@ -3343,18 +3869,21 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
ExecParallelHashIncreaseNumBatches(hashtable);
else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
ExecParallelHashIncreaseNumBuckets(hashtable);
+ else if (growth == PHJ_GROWTH_SPILL_BATCH0)
+ ExecParallelHashTableEvictBatch0(hashtable);
return false;
}
if (pstate->growth != PHJ_GROWTH_DISABLED &&
batch->at_least_one_chunk &&
- (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE
- > pstate->space_allowed))
+ (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE > pstate->space_allowed) &&
+ !batch->shared->hashloop_fallback)
{
/*
* We have determined that this batch would exceed the space budget if
- * loaded into memory. Command all participants to help repartition.
+ * loaded into memory. It is also not yet marked as a fallback batch.
+ * Command all participants to help repartition.
*/
batch->shared->space_exhausted = true;
pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 5532b91a71dca..eb67aceebb746 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -92,6 +92,27 @@
* hash_mem of all participants to create a large shared hash table. If that
* turns out either at planning or execution time to be impossible then we
* fall back to regular hash_mem sized hash tables.
+ * If a given batch causes the number of batches to be doubled and data skew
+ * causes too few or too many tuples to be relocated to the child of this batch,
+ * the batch which is now home to the skewed tuples is marked as a "fallback"
+ * batch. This means that it will be processed using multiple loops --
+ * each loop probing an arbitrary stripe of tuples from this batch
+ * which fit in hash_mem or combined hash_mem.
+ * This batch is no longer permitted to cause growth in the number of batches.
+ *
+ * When the inner side of a fallback batch is loaded into memory, stripes of
+ * arbitrary tuples totaling hash_mem or combined hash_mem in size are loaded
+ * into the hashtable. After probing this stripe, the outer side batch is
+ * rewound and the next stripe is loaded. Each stripe of the inner batch is
+ * probed until all tuples from that batch have been processed.
+ *
+ * Tuples that match are emitted (depending on the join semantics of the
+ * particular join type) during probing of the stripe. However, in order to make
+ * left outer join work, unmatched tuples cannot be emitted NULL-extended until
+ * all stripes have been probed. To address this, a bitmap is created with a bit
+ * for each tuple of the outer side. If a tuple on the outer side matches a
+ * tuple from the inner, the corresponding bit is set. At the end of probing all
+ * stripes, the executor scans the bitmap and emits unmatched outer tuples.
*
* To avoid deadlocks, we never wait for any barrier unless it is known that
* all other backends attached to it are actively executing the node or have
@@ -126,7 +147,7 @@
#define HJ_SCAN_BUCKET 3
#define HJ_FILL_OUTER_TUPLE 4
#define HJ_FILL_INNER_TUPLES 5
-#define HJ_NEED_NEW_BATCH 6
+#define HJ_NEED_NEW_STRIPE 6
/* Returns true if doing null-fill on outer relation */
#define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL)
@@ -143,10 +164,91 @@ static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
BufFile *file,
uint32 *hashvalue,
TupleTableSlot *tupleSlot);
+static int ExecHashJoinLoadStripe(HashJoinState *hjstate);
static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate);
+static bool ExecParallelHashJoinLoadStripe(HashJoinState *hjstate);
static void ExecParallelHashJoinPartitionOuter(HashJoinState *node);
+static bool checkbit(HashJoinState *hjstate);
+static void set_match_bit(HashJoinState *hjstate);
+
+static pg_attribute_always_inline bool
+ IsHashloopFallback(HashJoinTable hashtable);
+
+#define UINT_BITS (sizeof(unsigned int) * CHAR_BIT)
+
+static void
+set_match_bit(HashJoinState *hjstate)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ BufFile *statusFile = hashtable->hashloopBatchFile[hashtable->curbatch];
+ int tupindex = hjstate->hj_CurNumOuterTuples - 1;
+ size_t unit_size = sizeof(hjstate->hj_CurOuterMatchStatus);
+ off_t offset = tupindex / UINT_BITS * unit_size;
+
+ int fileno;
+ off_t cursor;
+
+ BufFileTell(statusFile, &fileno, &cursor);
+
+ /* Extend the statusFile if this is stripe zero. */
+ if (hashtable->curstripe == 0)
+ {
+ for (; cursor < offset + unit_size; cursor += unit_size)
+ {
+ hjstate->hj_CurOuterMatchStatus = 0;
+ BufFileWrite(statusFile, &hjstate->hj_CurOuterMatchStatus, unit_size);
+ }
+ }
+
+ if (cursor != offset)
+ BufFileSeek(statusFile, 0, offset, SEEK_SET);
+
+ BufFileRead(statusFile, &hjstate->hj_CurOuterMatchStatus, unit_size);
+ BufFileSeek(statusFile, 0, -unit_size, SEEK_CUR);
+
+ hjstate->hj_CurOuterMatchStatus |= 1U << tupindex % UINT_BITS;
+ BufFileWrite(statusFile, &hjstate->hj_CurOuterMatchStatus, unit_size);
+}
+/* return true if bit is set and false if not */
+static bool
+checkbit(HashJoinState *hjstate)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ int curbatch = hashtable->curbatch;
+ BufFile *outer_match_statuses;
+
+ int bitno = hjstate->hj_EmitOuterTupleId % UINT_BITS;
+
+ hjstate->hj_EmitOuterTupleId++;
+ outer_match_statuses = hjstate->hj_HashTable->hashloopBatchFile[curbatch];
+
+ /*
+ * if current chunk of bitmap is exhausted, read next chunk of bitmap from
+ * outer_match_status_file
+ */
+ if (bitno == 0)
+ BufFileRead(outer_match_statuses, &hjstate->hj_CurOuterMatchStatus,
+ sizeof(hjstate->hj_CurOuterMatchStatus));
+
+ /*
+ * check if current tuple's match bit is set in outer match status file
+ */
+ return hjstate->hj_CurOuterMatchStatus & (1U << bitno);
+}
+
+static bool
+IsHashloopFallback(HashJoinTable hashtable)
+{
+ if (hashtable->parallel_state)
+ return hashtable->batches[hashtable->curbatch].shared->hashloop_fallback;
+
+ if (!hashtable->hashloopBatchFile)
+ return false;
+
+ return hashtable->hashloopBatchFile[hashtable->curbatch];
+}
/* ----------------------------------------------------------------
* ExecHashJoinImpl
@@ -290,6 +392,12 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
hashNode->hashtable = hashtable;
(void) MultiExecProcNode((PlanState *) hashNode);
+ /*
+ * After building the hashtable, stripe 0 of batch 0 will have
+ * been loaded.
+ */
+ hashtable->curstripe = 0;
+
/*
* If the inner relation is completely empty, and we're not
* doing a left outer join, we can quit without scanning the
@@ -324,21 +432,21 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
* If multi-batch, we need to hash the outer relation
* up front.
*/
- if (hashtable->nbatch > 1)
+ if (hashtable->nbatch > 1 || (hashtable->nbatch == 1 && hashtable->batches[0].shared->hashloop_fallback))
ExecParallelHashJoinPartitionOuter(node);
BarrierArriveAndWait(build_barrier,
WAIT_EVENT_HASH_BUILD_HASH_OUTER);
+
}
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
/* Each backend should now select a batch to work on. */
hashtable->curbatch = -1;
- node->hj_JoinState = HJ_NEED_NEW_BATCH;
- continue;
+ if (!ExecParallelHashJoinNewBatch(node))
+ return NULL;
}
- else
- node->hj_JoinState = HJ_NEED_NEW_OUTER;
+ node->hj_JoinState = HJ_NEED_NEW_OUTER;
/* FALL THRU */
@@ -365,12 +473,18 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
node->hj_JoinState = HJ_FILL_INNER_TUPLES;
}
else
- node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ node->hj_JoinState = HJ_NEED_NEW_STRIPE;
continue;
}
econtext->ecxt_outertuple = outerTupleSlot;
- node->hj_MatchedOuter = false;
+
+ /*
+ * Don't reset hj_MatchedOuter after the first stripe as it
+ * would cancel out whatever we found before
+ */
+ if (node->hj_HashTable->curstripe == 0)
+ node->hj_MatchedOuter = false;
/*
* Find the corresponding bucket for this tuple in the main
@@ -386,9 +500,15 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
/*
* The tuple might not belong to the current batch (where
* "current batch" includes the skew buckets if any).
+ *
+ * This should only be done once per tuple per batch. If a
+ * batch "falls back", its inner side will be split into
+ * stripes. Any displaced outer tuples should only be
+ * relocated while probing the first stripe of the inner side.
*/
if (batchno != hashtable->curbatch &&
- node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
+ node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO &&
+ node->hj_HashTable->curstripe == 0)
{
bool shouldFree;
MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot,
@@ -410,6 +530,13 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
continue;
}
+ /*
+ * While probing the phantom stripe, don't increment
+ * hj_CurNumOuterTuples or extend the bitmap
+ */
+ if (!parallel && hashtable->curstripe != PHANTOM_STRIPE)
+ node->hj_CurNumOuterTuples++;
+
/* OK, let's scan the bucket for matches */
node->hj_JoinState = HJ_SCAN_BUCKET;
@@ -455,6 +582,25 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
{
node->hj_MatchedOuter = true;
+ if (HJ_FILL_OUTER(node) && IsHashloopFallback(hashtable))
+ {
+ /*
+ * Each bit corresponds to a single tuple. Setting the
+ * match bit keeps track of which tuples were matched
+ * for batches which are using the block nested
+ * hashloop fallback method. It persists this match
+ * status across multiple stripes of tuples, each of
+ * which is loaded into the hashtable and probed. The
+ * outer match status file is the cumulative match
+ * status of outer tuples for a given batch across all
+ * stripes of that inner side batch.
+ */
+ if (parallel)
+ sb_setbit(hashtable->batches[hashtable->curbatch].sba, econtext->ecxt_outertuple->tts_tuplenum);
+ else
+ set_match_bit(node);
+ }
+
if (parallel)
{
/*
@@ -488,8 +634,17 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
* continue with next outer tuple.
*/
if (node->js.single_match)
+ {
node->hj_JoinState = HJ_NEED_NEW_OUTER;
+ /*
+ * Only consider returning the tuple while on the
+ * first stripe.
+ */
+ if (node->hj_HashTable->curstripe != 0)
+ continue;
+ }
+
if (otherqual == NULL || ExecQual(otherqual, econtext))
return ExecProject(node->js.ps.ps_ProjInfo);
else
@@ -508,6 +663,22 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
*/
node->hj_JoinState = HJ_NEED_NEW_OUTER;
+ if (IsHashloopFallback(hashtable) && HJ_FILL_OUTER(node))
+ {
+ if (hashtable->curstripe != PHANTOM_STRIPE)
+ continue;
+
+ if (parallel)
+ {
+ ParallelHashJoinBatchAccessor *accessor =
+ &node->hj_HashTable->batches[node->hj_HashTable->curbatch];
+
+ node->hj_MatchedOuter = sb_checkbit(accessor->sba, econtext->ecxt_outertuple->tts_tuplenum);
+ }
+ else
+ node->hj_MatchedOuter = checkbit(node);
+ }
+
if (!node->hj_MatchedOuter &&
HJ_FILL_OUTER(node))
{
@@ -534,7 +705,7 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
if (!ExecScanHashTableForUnmatched(node, econtext))
{
/* no more unmatched tuples */
- node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ node->hj_JoinState = HJ_NEED_NEW_STRIPE;
continue;
}
@@ -550,19 +721,23 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
InstrCountFiltered2(node, 1);
break;
- case HJ_NEED_NEW_BATCH:
+ case HJ_NEED_NEW_STRIPE:
/*
- * Try to advance to next batch. Done if there are no more.
+ * Try to advance to next stripe. Then try to advance to the
+ * next batch if there are no more stripes in this batch. Done
+ * if there are no more batches.
*/
if (parallel)
{
- if (!ExecParallelHashJoinNewBatch(node))
+ if (!ExecParallelHashJoinLoadStripe(node) &&
+ !ExecParallelHashJoinNewBatch(node))
return NULL; /* end of parallel-aware join */
}
else
{
- if (!ExecHashJoinNewBatch(node))
+ if (!ExecHashJoinLoadStripe(node) &&
+ !ExecHashJoinNewBatch(node))
return NULL; /* end of parallel-oblivious join */
}
node->hj_JoinState = HJ_NEED_NEW_OUTER;
@@ -751,6 +926,8 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
hjstate->hj_MatchedOuter = false;
hjstate->hj_OuterNotEmpty = false;
+ hjstate->hj_CurNumOuterTuples = 0;
+ hjstate->hj_CurOuterMatchStatus = 0;
return hjstate;
}
@@ -890,10 +1067,16 @@ ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
/*
* In the Parallel Hash case we only run the outer plan directly for
* single-batch hash joins. Otherwise we have to go to batch files, even
- * for batch 0.
+ * for batch 0. For a single-batch hash join which, due to data skew, has
+ * multiple stripes and is a "fallback" batch, we must still save the
+ * outer tuples into batch files.
*/
- if (curbatch == 0 && hashtable->nbatch == 1)
+ LWLockAcquire(&hashtable->batches[0].shared->lock, LW_SHARED);
+
+ if (curbatch == 0 && hashtable->nbatch == 1 && !hashtable->batches[0].shared->hashloop_fallback)
{
+ LWLockRelease(&hashtable->batches[0].shared->lock);
+
slot = ExecProcNode(outerNode);
while (!TupIsNull(slot))
@@ -917,21 +1100,36 @@ ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
}
else if (curbatch < hashtable->nbatch)
{
+
+ tupleMetadata metadata;
MinimalTuple tuple;
- tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples,
- hashvalue);
+ LWLockRelease(&hashtable->batches[0].shared->lock);
+
+ tuple =
+ sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples,
+ &metadata);
+ *hashvalue = metadata.hashvalue;
+
if (tuple != NULL)
{
ExecForceStoreMinimalTuple(tuple,
hjstate->hj_OuterTupleSlot,
false);
+
+ /*
+ * TODO: should we use tupleid instead of position in the serial
+ * case too?
+ */
+ hjstate->hj_OuterTupleSlot->tts_tuplenum = metadata.tupleid;
slot = hjstate->hj_OuterTupleSlot;
return slot;
}
else
ExecClearTuple(hjstate->hj_OuterTupleSlot);
}
+ else
+ LWLockRelease(&hashtable->batches[0].shared->lock);
/* End of this batch */
return NULL;
@@ -949,24 +1147,37 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
HashJoinTable hashtable = hjstate->hj_HashTable;
int nbatch;
int curbatch;
- BufFile *innerFile;
- TupleTableSlot *slot;
- uint32 hashvalue;
+ BufFile *innerFile = NULL;
+ BufFile *outerFile = NULL;
nbatch = hashtable->nbatch;
curbatch = hashtable->curbatch;
- if (curbatch > 0)
+ /*
+ * We no longer need the previous outer batch file; close it right away to
+ * free disk space.
+ */
+ if (hashtable->outerBatchFile && hashtable->outerBatchFile[curbatch])
{
- /*
- * We no longer need the previous outer batch file; close it right
- * away to free disk space.
- */
- if (hashtable->outerBatchFile[curbatch])
- BufFileClose(hashtable->outerBatchFile[curbatch]);
+ BufFileClose(hashtable->outerBatchFile[curbatch]);
hashtable->outerBatchFile[curbatch] = NULL;
}
- else /* we just finished the first batch */
+ if (IsHashloopFallback(hashtable))
+ {
+ BufFileClose(hashtable->hashloopBatchFile[curbatch]);
+ hashtable->hashloopBatchFile[curbatch] = NULL;
+ }
+
+ /*
+ * We are surely done with the inner batch file now
+ */
+ if (hashtable->innerBatchFile && hashtable->innerBatchFile[curbatch])
+ {
+ BufFileClose(hashtable->innerBatchFile[curbatch]);
+ hashtable->innerBatchFile[curbatch] = NULL;
+ }
+
+ if (curbatch == 0) /* we just finished the first batch */
{
/*
* Reset some of the skew optimization state variables, since we no
@@ -1030,55 +1241,168 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
return false; /* no more batches */
hashtable->curbatch = curbatch;
+ hashtable->curstripe = STRIPE_DETACHED;
+ hjstate->hj_CurNumOuterTuples = 0;
- /*
- * Reload the hash table with the new inner batch (which could be empty)
- */
- ExecHashTableReset(hashtable);
+ if (hashtable->innerBatchFile && hashtable->innerBatchFile[curbatch])
+ innerFile = hashtable->innerBatchFile[curbatch];
+
+ if (innerFile && BufFileSeek(innerFile, 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file: %m")));
+
+ /* Need to rewind outer when this is the first stripe of a new batch */
+ if (hashtable->outerBatchFile && hashtable->outerBatchFile[curbatch])
+ outerFile = hashtable->outerBatchFile[curbatch];
+
+ if (outerFile && BufFileSeek(outerFile, 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file: %m")));
+
+ ExecHashJoinLoadStripe(hjstate);
+ return true;
+}
- innerFile = hashtable->innerBatchFile[curbatch];
+static inline void
+InstrIncrBatchStripes(List *fallback_batches_stats, int curbatch)
+{
+ ListCell *lc;
- if (innerFile != NULL)
+ foreach(lc, fallback_batches_stats)
{
- if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not rewind hash-join temporary file")));
+ FallbackBatchStats *fallback_batch_stats = lfirst(lc);
- while ((slot = ExecHashJoinGetSavedTuple(hjstate,
- innerFile,
- &hashvalue,
- hjstate->hj_HashTupleSlot)))
+ if (fallback_batch_stats->batchno == curbatch)
{
- /*
- * NOTE: some tuples may be sent to future batches. Also, it is
- * possible for hashtable->nbatch to be increased here!
- */
- ExecHashTableInsert(hashtable, slot, hashvalue);
+ fallback_batch_stats->numstripes++;
+ break;
}
-
- /*
- * after we build the hash table, the inner batch file is no longer
- * needed
- */
- BufFileClose(innerFile);
- hashtable->innerBatchFile[curbatch] = NULL;
}
+}
+
+static inline void
+InstrAppendParallelBatchStripes(List **fallback_batches_stats, int curbatch, int nstripes)
+{
+ FallbackBatchStats *fallback_batch_stats;
+
+ fallback_batch_stats = palloc(sizeof(FallbackBatchStats));
+ fallback_batch_stats->batchno = curbatch;
+ /* Display the total number of stripes as a 1-indexed number */
+ fallback_batch_stats->numstripes = nstripes + 1;
+ *fallback_batches_stats = lappend(*fallback_batches_stats, fallback_batch_stats);
+}
+
+/*
+ * Returns false when the inner batch file is exhausted
+ */
+static int
+ExecHashJoinLoadStripe(HashJoinState *hjstate)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ int curbatch = hashtable->curbatch;
+ TupleTableSlot *slot;
+ uint32 hashvalue;
+ bool loaded_inner = false;
+
+ if (hashtable->curstripe == PHANTOM_STRIPE)
+ return false;
/*
* Rewind outer batch file (if present), so that we can start reading it.
+ * TODO: This is only necessary if this is not the first stripe of the
+ * batch
*/
- if (hashtable->outerBatchFile[curbatch] != NULL)
+ if (hashtable->outerBatchFile && hashtable->outerBatchFile[curbatch])
{
if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not rewind hash-join temporary file")));
+ errmsg("could not rewind hash-join temporary file: %m")));
+ }
+ if (hashtable->innerBatchFile && hashtable->innerBatchFile[curbatch] && hashtable->curbatch == 0 && hashtable->curstripe == 0)
+ {
+ if (BufFileSeek(hashtable->innerBatchFile[curbatch], 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file: %m")));
}
- return true;
+ hashtable->curstripe++;
+
+ if (!hashtable->innerBatchFile || !hashtable->innerBatchFile[curbatch])
+ return false;
+
+ /*
+ * Reload the hash table with the new inner stripe
+ */
+ ExecHashTableReset(hashtable);
+
+ while ((slot = ExecHashJoinGetSavedTuple(hjstate,
+ hashtable->innerBatchFile[curbatch],
+ &hashvalue,
+ hjstate->hj_HashTupleSlot)))
+ {
+ /*
+ * NOTE: some tuples may be sent to future batches. Also, it is
+ * possible for hashtable->nbatch to be increased here!
+ */
+ uint32 hashTupleSize;
+
+ /*
+ * TODO: wouldn't it be cool if this returned the size of the tuple
+ * inserted
+ */
+ ExecHashTableInsert(hashtable, slot, hashvalue);
+ loaded_inner = true;
+
+ if (!IsHashloopFallback(hashtable))
+ continue;
+
+ hashTupleSize = slot->tts_ops->get_minimal_tuple(slot)->t_len + HJTUPLE_OVERHEAD;
+
+ if (hashtable->spaceUsed + hashTupleSize +
+ hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
+ > hashtable->spaceAllowed)
+ break;
+ }
+
+ /*
+ * if we didn't load anything and it is a FOJ/LOJ fallback batch, we will
+ * transition to emit unmatched outer tuples next. we want to know how
+ * many tuples were in the batch in that case, so don't zero it out then
+ */
+
+ /*
+ * if we loaded anything into the hashtable or it is the phantom stripe,
+ * must proceed to probing
+ */
+ if (loaded_inner)
+ {
+ hjstate->hj_CurNumOuterTuples = 0;
+ InstrIncrBatchStripes(hashtable->fallback_batches_stats, curbatch);
+ return true;
+ }
+
+ if (IsHashloopFallback(hashtable) && HJ_FILL_OUTER(hjstate))
+ {
+ /*
+ * if we didn't load anything and it is a fallback batch, we will
+ * prepare to emit outer tuples during the phantom stripe probing
+ */
+ hashtable->curstripe = PHANTOM_STRIPE;
+ hjstate->hj_EmitOuterTupleId = 0;
+ hjstate->hj_CurOuterMatchStatus = 0;
+ BufFileSeek(hashtable->hashloopBatchFile[curbatch], 0, 0, SEEK_SET);
+ if (hashtable->outerBatchFile[curbatch])
+ BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET);
+ return true;
+ }
+ return false;
}
+
/*
* Choose a batch to work on, and attach to it. Returns true if successful,
* false if there are no more batches.
@@ -1101,11 +1425,24 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
/*
* If we were already attached to a batch, remember not to bother checking
* it again, and detach from it (possibly freeing the hash table if we are
- * last to detach).
+ * last to detach). curbatch is set when the batch_barrier phase is either
+ * PHJ_BATCH_LOADING or PHJ_BATCH_STRIPING (note that the
+ * PHJ_BATCH_LOADING case will fall through to the PHJ_BATCH_STRIPING
+ * case). The PHJ_BATCH_STRIPING case returns to the caller. So when this
+ * function is reentered with a curbatch >= 0 then we must be done
+ * probing.
*/
+
if (hashtable->curbatch >= 0)
{
- hashtable->batches[hashtable->curbatch].done = true;
+ ParallelHashJoinBatchAccessor *batch_accessor = &hashtable->batches[hashtable->curbatch];
+
+ if (IsHashloopFallback(hashtable))
+ {
+ InstrAppendParallelBatchStripes(&hashtable->fallback_batches_stats, hashtable->curbatch, batch_accessor->shared->nstripes);
+ sb_end_write(hashtable->batches[hashtable->curbatch].sba);
+ }
+ batch_accessor->done = PHJ_BATCH_ACCESSOR_DONE;
ExecHashTableDetachBatch(hashtable);
}
@@ -1119,13 +1456,8 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
hashtable->nbatch;
do
{
- uint32 hashvalue;
- MinimalTuple tuple;
- TupleTableSlot *slot;
-
- if (!hashtable->batches[batchno].done)
+ if (hashtable->batches[batchno].done != PHJ_BATCH_ACCESSOR_DONE)
{
- SharedTuplestoreAccessor *inner_tuples;
Barrier *batch_barrier =
&hashtable->batches[batchno].shared->batch_barrier;
@@ -1136,7 +1468,15 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
/* One backend allocates the hash table. */
if (BarrierArriveAndWait(batch_barrier,
WAIT_EVENT_HASH_BATCH_ELECT))
+ {
ExecParallelHashTableAlloc(hashtable, batchno);
+
+ /*
+ * one worker needs to 0 out the read_pages of all the
+ * participants in the new batch
+ */
+ sts_reinitialize(hashtable->batches[batchno].inner_tuples);
+ }
/* Fall through. */
case PHJ_BATCH_ALLOCATING:
@@ -1145,41 +1485,31 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
WAIT_EVENT_HASH_BATCH_ALLOCATE);
/* Fall through. */
- case PHJ_BATCH_LOADING:
- /* Start (or join in) loading tuples. */
- ExecParallelHashTableSetCurrentBatch(hashtable, batchno);
- inner_tuples = hashtable->batches[batchno].inner_tuples;
- sts_begin_parallel_scan(inner_tuples);
- while ((tuple = sts_parallel_scan_next(inner_tuples,
- &hashvalue)))
- {
- ExecForceStoreMinimalTuple(tuple,
- hjstate->hj_HashTupleSlot,
- false);
- slot = hjstate->hj_HashTupleSlot;
- ExecParallelHashTableInsertCurrentBatch(hashtable, slot,
- hashvalue);
- }
- sts_end_parallel_scan(inner_tuples);
- BarrierArriveAndWait(batch_barrier,
- WAIT_EVENT_HASH_BATCH_LOAD);
- /* Fall through. */
+ case PHJ_BATCH_STRIPING:
- case PHJ_BATCH_PROBING:
+ ExecParallelHashTableSetCurrentBatch(hashtable, batchno);
+ sts_begin_parallel_scan(hashtable->batches[batchno].inner_tuples);
+ if (hashtable->batches[batchno].shared->hashloop_fallback)
+ sb_initialize_accessor(hashtable->batches[hashtable->curbatch].sba,
+ sts_get_tuplenum(hashtable->batches[hashtable->curbatch].outer_tuples));
+ hashtable->curstripe = STRIPE_DETACHED;
+ if (ExecParallelHashJoinLoadStripe(hjstate))
+ return true;
/*
- * This batch is ready to probe. Return control to
- * caller. We stay attached to batch_barrier so that the
- * hash table stays alive until everyone's finished
- * probing it, but no participant is allowed to wait at
- * this barrier again (or else a deadlock could occur).
- * All attached participants must eventually call
- * BarrierArriveAndDetach() so that the final phase
- * PHJ_BATCH_DONE can be reached.
+ * ExecParallelHashJoinLoadStripe() will return false from
+ * here when no more work can be done by this worker on
+ * this batch. Until further optimized, this worker will
+ * have detached from the stripe_barrier and should close
+ * its outer match statuses bitmap and then detach from
+ * the batch. In order to reuse the code below, fall
+ * through, even though the phase will not have been
+ * advanced
*/
- ExecParallelHashTableSetCurrentBatch(hashtable, batchno);
- sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples);
- return true;
+ if (hashtable->batches[batchno].shared->hashloop_fallback)
+ sb_end_write(hashtable->batches[batchno].sba);
+
+ /* Fall through. */
case PHJ_BATCH_DONE:
@@ -1187,8 +1517,16 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
* Already done. Detach and go around again (if any
* remain).
*/
+
+ /*
+ * In case the leader joins late, we have to make sure
+ * that all workers have the final number of stripes.
+ */
+ if (hashtable->batches[batchno].shared->hashloop_fallback)
+ InstrAppendParallelBatchStripes(&hashtable->fallback_batches_stats, batchno, hashtable->batches[batchno].shared->nstripes);
BarrierDetach(batch_barrier);
- hashtable->batches[batchno].done = true;
+ hashtable->batches[batchno].done = PHJ_BATCH_ACCESSOR_DONE;
+
hashtable->curbatch = -1;
break;
@@ -1203,6 +1541,244 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
return false;
}
+
+
+/*
+ * Returns true if ready to probe and false if the inner is exhausted
+ * (there are no more stripes)
+ */
+bool
+ExecParallelHashJoinLoadStripe(HashJoinState *hjstate)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ int batchno = hashtable->curbatch;
+ ParallelHashJoinBatchAccessor *batch_accessor = &(hashtable->batches[batchno]);
+ ParallelHashJoinBatch *batch = batch_accessor->shared;
+ Barrier *stripe_barrier = &batch->stripe_barrier;
+ SharedTuplestoreAccessor *outer_tuples;
+ SharedTuplestoreAccessor *inner_tuples;
+
+ outer_tuples = hashtable->batches[batchno].outer_tuples;
+ inner_tuples = hashtable->batches[batchno].inner_tuples;
+
+ if (hashtable->curstripe >= 0)
+ {
+ /*
+ * If a worker is already attached to a stripe, wait until all
+ * participants have finished probing and detach. The last worker,
+ * however, can re-attach to the stripe_barrier and proceed to load
+ * and probe the other stripes
+ *
+ * After finishing with participating in a stripe, if a worker is the
+ * only one working on a batch, it will continue working on it.
+ * However, if a worker is not the only worker working on a batch, it
+ * would risk deadlock if it waits on the barrier. Instead, it will
+ * detach from the stripe, and, eventually the batch.
+ *
+ * This means all stripes after the first stripe will be executed
+ * serially. TODO: allow workers to provisionally detach from the
+ * batch and reattach later if there is still work to be done. I had a
+ * patch that did this. Workers who were not the last worker saved the
+ * state of the stripe barrier upon detaching and then mark the batch
+ * as "provisionally" done (not done). Later, when the worker comes
+ * back to the batch in the batch phase machine, if the batch is not
+ * complete and the phase has advanced since the worker was last
+ * participating, then the worker can join back in. This had problems.
+ * There were synchronization issues with workers having multiple
+ * outer match status bitmap files open at the same time, so, I had
+ * workers close their bitmap and make a new one the next time they
+ * joined in. This didn't work with the current code because the
+ * original outer match status bitmap file that the worker had created
+ * while probing stripe 1 did not get combined into the combined
+ * bitmap This could be specifically fixed, but I think it is better
+ * to address the lack of parallel execution for stripes after stripe
+ * 0 more holistically.
+ */
+ if (!BarrierArriveAndDetach(stripe_barrier))
+ {
+ sb_end_write(batch_accessor->sba);
+ hashtable->curstripe = STRIPE_DETACHED;
+ return false;
+ }
+
+ /*
+ * This isn't a race condition if no other workers can stay attached
+ * to this barrier in the intervening time. Basically, if you attach
+ * to a stripe barrier in the PHJ_STRIPE_DONE phase, detach
+ * immediately and move on.
+ */
+ BarrierAttach(stripe_barrier);
+ }
+ else if (hashtable->curstripe == STRIPE_DETACHED)
+ {
+ int phase = BarrierAttach(stripe_barrier);
+
+ /*
+ * If a worker enters this phase machine for the first time for this
+ * batch on a stripe number greater than the batch's maximum stripe
+ * number, then: 1) The batch is done, or 2) The batch is on the
+ * phantom stripe that's used for hashloop fallback. Either way the
+ * worker can't contribute, so it will just detach and move on.
+ */
+ if (PHJ_STRIPE_NUMBER(phase) > batch->nstripes ||
+ PHJ_STRIPE_PHASE(phase) == PHJ_STRIPE_DONE)
+ return ExecHashTableDetachStripe(hashtable);
+ }
+ else if (hashtable->curstripe == PHANTOM_STRIPE)
+ {
+ /* Only the last worker will execute this code. */
+ sts_end_parallel_scan(outer_tuples);
+
+ /*
+ * TODO: ideally this would go somewhere in the batch phase machine
+ * Putting it in ExecHashTableDetachBatch didn't do the trick
+ */
+ sb_end_read(batch_accessor->sba);
+ return ExecHashTableDetachStripe(hashtable);
+ }
+
+ hashtable->curstripe = PHJ_STRIPE_NUMBER(BarrierPhase(stripe_barrier));
+
+ /*
+ * The outer side is exhausted and either 1) the current stripe of the
+ * inner side is exhausted and it is time to advance the stripe 2) the
+ * last stripe of the inner side is exhausted and it is time to advance
+ * the batch
+ */
+ for (;;)
+ {
+ MinimalTuple tuple;
+ tupleMetadata metadata;
+
+ bool overflow_required = false;
+ int phase = BarrierPhase(stripe_barrier);
+
+ switch (PHJ_STRIPE_PHASE(phase))
+ {
+ case PHJ_STRIPE_ELECTING:
+ if (BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_ELECT))
+ sts_reinitialize(outer_tuples);
+ /* FALLTHROUGH */
+ case PHJ_STRIPE_RESETTING:
+
+ /*
+ * This barrier allows the elected worker to finish resetting
+ * the read_page for the outer side as well as allowing the
+ * worker which was elected to clear out the hashtable from
+ * the last stripe to finish.
+ */
+ BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_RESET);
+ /* FALLTHROUGH */
+ case PHJ_STRIPE_LOADING:
+
+ /*
+ * Start (or join in) loading the next stripe of inner tuples.
+ */
+ sts_begin_parallel_scan(inner_tuples);
+
+ /*
+ * TODO: add functionality to pre-alloc some memory before
+ * calling sts_parallel_scan_next() because that will reserve
+ * an additional STS_CHUNK for every stripe for each worker
+ * that won't fit, so we should first see if the chunk would
+ * fit before getting the assignment
+ */
+ while ((tuple = sts_parallel_scan_next(inner_tuples, &metadata)))
+ {
+ ExecForceStoreMinimalTuple(tuple, hjstate->hj_HashTupleSlot, false);
+ if (!ExecParallelHashTableInsertCurrentBatch(hashtable, hjstate->hj_HashTupleSlot, metadata.hashvalue, sta_get_read_participant(inner_tuples)))
+ {
+ overflow_required = true;
+ pg_atomic_test_set_flag(&batch->overflow_required);
+ break;
+ }
+ }
+
+ if (BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_LOAD))
+ {
+ if (!pg_atomic_unlocked_test_flag(&batch->overflow_required))
+ batch->nstripes++;
+ }
+ /* FALLTHROUGH */
+ case PHJ_STRIPE_OVERFLOWING:
+ if (overflow_required)
+ {
+ Assert(tuple);
+ sts_spill_leftover_tuples(inner_tuples, tuple, metadata.hashvalue);
+ }
+ BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_OVERFLOW);
+
+ /* FALLTHROUGH */
+ case PHJ_STRIPE_PROBING:
+ {
+ /*
+ * do this again here in case a worker began the scan and
+ * then entered after loading before probing
+ */
+ sts_end_parallel_scan(inner_tuples);
+ sts_begin_parallel_scan(outer_tuples);
+ return true;
+ }
+
+ case PHJ_STRIPE_DONE:
+ if (PHJ_STRIPE_NUMBER(phase) >= batch->nstripes)
+ {
+ /*
+ * Handle the phantom stripe case.
+ */
+ if (batch->hashloop_fallback && HJ_FILL_OUTER(hjstate))
+ goto fallback_stripe;
+
+ /* Return if this is the last stripe */
+ return ExecHashTableDetachStripe(hashtable);
+ }
+
+ /* this, effectively, increments the stripe number */
+ if (BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_LOAD))
+ {
+ ExecParallelHashTableRecycle(hashtable);
+ pg_atomic_clear_flag(&batch->overflow_required);
+ }
+
+ hashtable->curstripe++;
+ continue;
+
+ default:
+ elog(ERROR, "unexpected stripe phase %d. pid %i. batch %i.", BarrierPhase(stripe_barrier), MyProcPid, batchno);
+ }
+ }
+
+fallback_stripe:
+ sb_end_write(batch_accessor->sba);
+
+ /* Ensure that only a single worker is attached to the barrier */
+ if (!BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_LOAD))
+ return ExecHashTableDetachStripe(hashtable);
+
+ /* No one except the last worker will run this code */
+ hashtable->curstripe = PHANTOM_STRIPE;
+
+ ExecParallelHashTableRecycle(hashtable);
+ pg_atomic_clear_flag(&batch->overflow_required);
+
+ /*
+ * If all workers (including this one) have finished probing the batch,
+ * one worker is elected to Loop through the outer match status files from
+ * all workers that were attached to this batch Combine them into one
+ * bitmap Use the bitmap, loop through the outer batch file again, and
+ * emit unmatched tuples All workers will detach from the batch barrier
+ * and the last worker will clean up the hashtable. All workers except the
+ * last worker will end their scans of the outer and inner side. The last
+ * worker will end its scan of the inner side
+ */
+ sb_combine(batch_accessor->sba);
+ sts_reinitialize(outer_tuples);
+
+ sts_begin_parallel_scan(outer_tuples);
+
+ return true;
+}
+
/*
* ExecHashJoinSaveTuple
* save a tuple to a batch file.
@@ -1364,6 +1940,9 @@ ExecReScanHashJoin(HashJoinState *node)
node->hj_MatchedOuter = false;
node->hj_FirstOuterTupleSlot = NULL;
+ node->hj_CurNumOuterTuples = 0;
+ node->hj_CurOuterMatchStatus = 0;
+
/*
* if chgParam of subnode is not null then plan will be re-scanned by
* first ExecProcNode.
@@ -1394,7 +1973,6 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate)
ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
HashJoinTable hashtable = hjstate->hj_HashTable;
TupleTableSlot *slot;
- uint32 hashvalue;
int i;
Assert(hjstate->hj_FirstOuterTupleSlot == NULL);
@@ -1402,6 +1980,8 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate)
/* Execute outer plan, writing all tuples to shared tuplestores. */
for (;;)
{
+ tupleMetadata metadata;
+
slot = ExecProcNode(outerState);
if (TupIsNull(slot))
break;
@@ -1410,17 +1990,25 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate)
hjstate->hj_OuterHashKeys,
true, /* outer tuple */
HJ_FILL_OUTER(hjstate),
- &hashvalue))
+ &metadata.hashvalue))
{
int batchno;
int bucketno;
bool shouldFree;
+ SharedTuplestoreAccessor *accessor;
+
MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &shouldFree);
- ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno,
+ ExecHashGetBucketAndBatch(hashtable, metadata.hashvalue, &bucketno,
&batchno);
+ accessor = hashtable->batches[batchno].outer_tuples;
+
+ /* cannot count on deterministic order of tupleids */
+ metadata.tupleid = sts_increment_ntuples(accessor);
+
sts_puttuple(hashtable->batches[batchno].outer_tuples,
- &hashvalue, mintup);
+ &metadata.hashvalue,
+ mintup);
if (shouldFree)
heap_free_minimal_tuple(mintup);
@@ -1481,6 +2069,8 @@ ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt)
LWLockInitialize(&pstate->lock,
LWTRANCHE_PARALLEL_HASH_JOIN);
BarrierInit(&pstate->build_barrier, 0);
+ BarrierInit(&pstate->eviction_barrier, 0);
+ BarrierInit(&pstate->repartition_barrier, 0);
BarrierInit(&pstate->grow_batches_barrier, 0);
BarrierInit(&pstate->grow_buckets_barrier, 0);
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 38c2fc0b50b66..9a7962518ee69 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -471,7 +471,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
{
SubPlan *subplan = node->subplan;
PlanState *planstate = node->planstate;
- int ncols = list_length(subplan->paramIds);
+ int ncols = node->numCols;
ExprContext *innerecontext = node->innerecontext;
MemoryContext oldcontext;
long nbuckets;
@@ -878,11 +878,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
ALLOCSET_SMALL_SIZES);
/* and a short-lived exprcontext for function evaluation */
sstate->innerecontext = CreateExprContext(estate);
- /* Silly little array of column numbers 1..n */
- ncols = list_length(subplan->paramIds);
- sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber));
- for (i = 0; i < ncols; i++)
- sstate->keyColIdx[i] = i + 1;
/*
* We use ExecProject to evaluate the lefthand and righthand
@@ -914,13 +909,15 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
(int) nodeTag(subplan->testexpr));
oplist = NIL; /* keep compiler quiet */
}
- Assert(list_length(oplist) == ncols);
+ ncols = list_length(oplist);
lefttlist = righttlist = NIL;
+ sstate->numCols = ncols;
+ sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber));
sstate->tab_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid));
+ sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid));
sstate->tab_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
sstate->tab_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
- sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid));
sstate->lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
sstate->cur_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
/* we'll need the cross-type equality fns below, but not in sstate */
@@ -979,6 +976,9 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
/* Set collation */
sstate->tab_collations[i - 1] = opexpr->inputcollid;
+ /* keyColIdx is just column numbers 1..n */
+ sstate->keyColIdx[i - 1] = i;
+
i++;
}
diff --git a/src/backend/jit/README b/src/backend/jit/README
index e2fac8558e8e3..5427bdf2153ff 100644
--- a/src/backend/jit/README
+++ b/src/backend/jit/README
@@ -10,11 +10,11 @@ SQL expressions to evaluate an SQL predicate like WHERE a.col = 3, it
is possible to generate a function than can be natively executed by
the CPU that just handles that expression, yielding a speedup.
-That this is done at query execution time, possibly even only in cases
-where the relevant task is done a number of times, makes it JIT,
-rather than ahead-of-time (AOT). Given the way JIT compilation is used
-in PostgreSQL, the lines between interpretation, AOT and JIT are
-somewhat blurry.
+This is JIT, rather than ahead-of-time (AOT) compilation, because it
+is done at query execution time, and perhaps only in cases where the
+relevant task is repeated a number of times. Given the way JIT
+compilation is used in PostgreSQL, the lines between interpretation,
+AOT and JIT are somewhat blurry.
Note that the interpreted program turned into a native program does
not necessarily have to be a program in the classical sense. E.g. it
@@ -99,7 +99,7 @@ Lifetimes of JITed functions are managed via JITContext. Exactly one
such context should be created for work in which all created JITed
function should have the same lifetime. E.g. there's exactly one
JITContext for each query executed, in the query's EState. Only the
-release of an JITContext is exposed to the provider independent
+release of a JITContext is exposed to the provider independent
facility, as the creation of one is done on-demand by the JIT
implementations.
@@ -231,7 +231,7 @@ needs to be referenced as an offset to one block of memory stored in
an ExprState, rather than absolute pointers into memory.
Once that is addressed, adding an LRU cache that's keyed by the
-generated LLVM IR will allow to use optimized functions even for
+generated LLVM IR will allow the usage of optimized functions even for
faster queries.
A longer term project is to move expression compilation to the planner
diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c
index d85ca9f7c5010..9ce8f43385ec8 100644
--- a/src/backend/nodes/nodeFuncs.c
+++ b/src/backend/nodes/nodeFuncs.c
@@ -575,27 +575,76 @@ exprIsLengthCoercion(const Node *expr, int32 *coercedTypmod)
return false;
}
+/*
+ * applyRelabelType
+ * Add a RelabelType node if needed to make the expression expose
+ * the specified type, typmod, and collation.
+ *
+ * This is primarily intended to be used during planning. Therefore, it must
+ * maintain the post-eval_const_expressions invariants that there are not
+ * adjacent RelabelTypes, and that the tree is fully const-folded (hence,
+ * we mustn't return a RelabelType atop a Const). If we do find a Const,
+ * we'll modify it in-place if "overwrite_ok" is true; that should only be
+ * passed as true if caller knows the Const is newly generated.
+ */
+Node *
+applyRelabelType(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid,
+ CoercionForm rformat, int rlocation, bool overwrite_ok)
+{
+ /*
+ * If we find stacked RelabelTypes (eg, from foo::int::oid) we can discard
+ * all but the top one, and must do so to ensure that semantically
+ * equivalent expressions are equal().
+ */
+ while (arg && IsA(arg, RelabelType))
+ arg = (Node *) ((RelabelType *) arg)->arg;
+
+ if (arg && IsA(arg, Const))
+ {
+ /* Modify the Const directly to preserve const-flatness. */
+ Const *con = (Const *) arg;
+
+ if (!overwrite_ok)
+ con = copyObject(con);
+ con->consttype = rtype;
+ con->consttypmod = rtypmod;
+ con->constcollid = rcollid;
+ /* We keep the Const's original location. */
+ return (Node *) con;
+ }
+ else if (exprType(arg) == rtype &&
+ exprTypmod(arg) == rtypmod &&
+ exprCollation(arg) == rcollid)
+ {
+ /* Sometimes we find a nest of relabels that net out to nothing. */
+ return arg;
+ }
+ else
+ {
+ /* Nope, gotta have a RelabelType. */
+ RelabelType *newrelabel = makeNode(RelabelType);
+
+ newrelabel->arg = (Expr *) arg;
+ newrelabel->resulttype = rtype;
+ newrelabel->resulttypmod = rtypmod;
+ newrelabel->resultcollid = rcollid;
+ newrelabel->relabelformat = rformat;
+ newrelabel->location = rlocation;
+ return (Node *) newrelabel;
+ }
+}
+
/*
* relabel_to_typmod
* Add a RelabelType node that changes just the typmod of the expression.
*
- * This is primarily intended to be used during planning. Therefore, it
- * strips any existing RelabelType nodes to maintain the planner's invariant
- * that there are not adjacent RelabelTypes.
+ * Convenience function for a common usage of applyRelabelType.
*/
Node *
relabel_to_typmod(Node *expr, int32 typmod)
{
- Oid type = exprType(expr);
- Oid coll = exprCollation(expr);
-
- /* Strip any existing RelabelType node(s) */
- while (expr && IsA(expr, RelabelType))
- expr = (Node *) ((RelabelType *) expr)->arg;
-
- /* Apply new typmod, preserving the previous exposed type and collation */
- return (Node *) makeRelabelType((Expr *) expr, type, typmod, coll,
- COERCE_EXPLICIT_CAST);
+ return applyRelabelType(expr, exprType(expr), typmod, exprCollation(expr),
+ COERCE_EXPLICIT_CAST, -1, false);
}
/*
diff --git a/src/backend/nodes/params.c b/src/backend/nodes/params.c
index 1719119fc28fb..bce0c7e72b2c5 100644
--- a/src/backend/nodes/params.c
+++ b/src/backend/nodes/params.c
@@ -16,6 +16,7 @@
#include "postgres.h"
#include "access/xact.h"
+#include "fmgr.h"
#include "mb/stringinfo_mb.h"
#include "nodes/params.h"
#include "parser/parse_node.h"
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 6da0dcd61cecd..b399592ff8150 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -912,7 +912,11 @@ set_foreign_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
/* ... but do not let it set the rows estimate to zero */
rel->rows = clamp_row_est(rel->rows);
- /* also, make sure rel->tuples is not insane relative to rel->rows */
+ /*
+ * Also, make sure rel->tuples is not insane relative to rel->rows.
+ * Notably, this ensures sanity if pg_class.reltuples contains -1 and the
+ * FDW doesn't do anything to replace that.
+ */
rel->tuples = Max(rel->tuples, rel->rows);
}
@@ -3182,6 +3186,17 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels)
* volatile qual could succeed for some SRF output rows and fail for others,
* a behavior that cannot occur if it's evaluated before SRF expansion.
*
+ * 6. If the subquery has nonempty grouping sets, we cannot push down any
+ * quals. The concern here is that a qual referencing a "constant" grouping
+ * column could get constant-folded, which would be improper because the value
+ * is potentially nullable by grouping-set expansion. This restriction could
+ * be removed if we had a parsetree representation that shows that such
+ * grouping columns are not really constant. (There are other ideas that
+ * could be used to relax this restriction, but that's the approach most
+ * likely to get taken in the future. Note that there's not much to be gained
+ * so long as subquery_planner can't move HAVING clauses to WHERE within such
+ * a subquery.)
+ *
* In addition, we make several checks on the subquery's output columns to see
* if it is safe to reference them in pushed-down quals. If output column k
* is found to be unsafe to reference, we set safetyInfo->unsafeColumns[k]
@@ -3226,6 +3241,10 @@ subquery_is_pushdown_safe(Query *subquery, Query *topquery,
if (subquery->limitOffset != NULL || subquery->limitCount != NULL)
return false;
+ /* Check point 6 */
+ if (subquery->groupClause && subquery->groupingSets)
+ return false;
+
/* Check points 3, 4, and 5 */
if (subquery->distinctClause ||
subquery->hasWindowFuncs ||
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index a3ebe10592d0e..37a735b06bba6 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -164,8 +164,7 @@ clauselist_selectivity_simple(PlannerInfo *root,
* directly to clause_selectivity(). None of what we might do below is
* relevant.
*/
- if ((list_length(clauses) == 1) &&
- bms_num_members(estimatedclauses) == 0)
+ if (list_length(clauses) == 1 && bms_is_empty(estimatedclauses))
return clause_selectivity(root, (Node *) linitial(clauses),
varRelid, jointype, sjinfo);
diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index b99cec00cb7a6..b68a5a0ec7171 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -490,10 +490,6 @@ process_equivalence(PlannerInfo *root,
* work to not label the collation at all in EC members, but this is risky
* since some parts of the system expect exprCollation() to deliver the
* right answer for a sort key.)
- *
- * Note this code assumes that the expression has already been through
- * eval_const_expressions, so there are no CollateExprs and no redundant
- * RelabelTypes.
*/
Expr *
canonicalize_ec_expression(Expr *expr, Oid req_type, Oid req_collation)
@@ -514,29 +510,24 @@ canonicalize_ec_expression(Expr *expr, Oid req_type, Oid req_collation)
exprCollation((Node *) expr) != req_collation)
{
/*
- * Strip any existing RelabelType, then add a new one if needed. This
- * is to preserve the invariant of no redundant RelabelTypes.
- *
- * If we have to change the exposed type of the stripped expression,
- * set typmod to -1 (since the new type may not have the same typmod
- * interpretation). If we only have to change collation, preserve the
- * exposed typmod.
+ * If we have to change the type of the expression, set typmod to -1,
+ * since the new type may not have the same typmod interpretation.
+ * When we only have to change collation, preserve the exposed typmod.
+ */
+ int32 req_typmod;
+
+ if (expr_type != req_type)
+ req_typmod = -1;
+ else
+ req_typmod = exprTypmod((Node *) expr);
+
+ /*
+ * Use applyRelabelType so that we preserve const-flatness. This is
+ * important since eval_const_expressions has already been applied.
*/
- while (expr && IsA(expr, RelabelType))
- expr = (Expr *) ((RelabelType *) expr)->arg;
-
- if (exprType((Node *) expr) != req_type)
- expr = (Expr *) makeRelabelType(expr,
- req_type,
- -1,
- req_collation,
- COERCE_IMPLICIT_CAST);
- else if (exprCollation((Node *) expr) != req_collation)
- expr = (Expr *) makeRelabelType(expr,
- req_type,
- exprTypmod((Node *) expr),
- req_collation,
- COERCE_IMPLICIT_CAST);
+ expr = (Expr *) applyRelabelType((Node *) expr,
+ req_type, req_typmod, req_collation,
+ COERCE_IMPLICIT_CAST, -1, false);
}
return expr;
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index 9a8f738c9d05b..6eb794669fe35 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -69,7 +69,7 @@ typedef struct inline_cte_walker_context
static Node *build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot,
List *plan_params,
SubLinkType subLinkType, int subLinkId,
- Node *testexpr, bool adjust_testexpr,
+ Node *testexpr, List *testexpr_paramids,
bool unknownEqFalse);
static List *generate_subquery_params(PlannerInfo *root, List *tlist,
List **paramIds);
@@ -81,7 +81,8 @@ static Node *convert_testexpr(PlannerInfo *root,
static Node *convert_testexpr_mutator(Node *node,
convert_testexpr_context *context);
static bool subplan_is_hashable(Plan *plan);
-static bool testexpr_is_hashable(Node *testexpr);
+static bool testexpr_is_hashable(Node *testexpr, List *param_ids);
+static bool test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids);
static bool hash_ok_operator(OpExpr *expr);
static bool contain_dml(Node *node);
static bool contain_dml_walker(Node *node, void *context);
@@ -237,7 +238,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
/* And convert to SubPlan or InitPlan format. */
result = build_subplan(root, plan, subroot, plan_params,
subLinkType, subLinkId,
- testexpr, true, isTopQual);
+ testexpr, NIL, isTopQual);
/*
* If it's a correlated EXISTS with an unimportant targetlist, we might be
@@ -291,12 +292,11 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
plan_params,
ANY_SUBLINK, 0,
newtestexpr,
- false, true));
+ paramIds,
+ true));
/* Check we got what we expected */
Assert(hashplan->parParam == NIL);
Assert(hashplan->useHashTable);
- /* build_subplan won't have filled in paramIds */
- hashplan->paramIds = paramIds;
/* Leave it to the executor to decide which plan to use */
asplan = makeNode(AlternativeSubPlan);
@@ -319,7 +319,7 @@ static Node *
build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot,
List *plan_params,
SubLinkType subLinkType, int subLinkId,
- Node *testexpr, bool adjust_testexpr,
+ Node *testexpr, List *testexpr_paramids,
bool unknownEqFalse)
{
Node *result;
@@ -484,10 +484,10 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot,
else
{
/*
- * Adjust the Params in the testexpr, unless caller said it's not
- * needed.
+ * Adjust the Params in the testexpr, unless caller already took care
+ * of it (as indicated by passing a list of Param IDs).
*/
- if (testexpr && adjust_testexpr)
+ if (testexpr && testexpr_paramids == NIL)
{
List *params;
@@ -499,7 +499,10 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot,
params);
}
else
+ {
splan->testexpr = testexpr;
+ splan->paramIds = testexpr_paramids;
+ }
/*
* We can't convert subplans of ALL_SUBLINK or ANY_SUBLINK types to
@@ -511,7 +514,7 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot,
if (subLinkType == ANY_SUBLINK &&
splan->parParam == NIL &&
subplan_is_hashable(plan) &&
- testexpr_is_hashable(splan->testexpr))
+ testexpr_is_hashable(splan->testexpr, splan->paramIds))
splan->useHashTable = true;
/*
@@ -734,24 +737,20 @@ subplan_is_hashable(Plan *plan)
/*
* testexpr_is_hashable: is an ANY SubLink's test expression hashable?
+ *
+ * To identify LHS vs RHS of the hash expression, we must be given the
+ * list of output Param IDs of the SubLink's subquery.
*/
static bool
-testexpr_is_hashable(Node *testexpr)
+testexpr_is_hashable(Node *testexpr, List *param_ids)
{
/*
* The testexpr must be a single OpExpr, or an AND-clause containing only
- * OpExprs.
- *
- * The combining operators must be hashable and strict. The need for
- * hashability is obvious, since we want to use hashing. Without
- * strictness, behavior in the presence of nulls is too unpredictable. We
- * actually must assume even more than plain strictness: they can't yield
- * NULL for non-null inputs, either (see nodeSubplan.c). However, hash
- * indexes and hash joins assume that too.
+ * OpExprs, each of which satisfy test_opexpr_is_hashable().
*/
if (testexpr && IsA(testexpr, OpExpr))
{
- if (hash_ok_operator((OpExpr *) testexpr))
+ if (test_opexpr_is_hashable((OpExpr *) testexpr, param_ids))
return true;
}
else if (is_andclause(testexpr))
@@ -764,7 +763,7 @@ testexpr_is_hashable(Node *testexpr)
if (!IsA(andarg, OpExpr))
return false;
- if (!hash_ok_operator((OpExpr *) andarg))
+ if (!test_opexpr_is_hashable((OpExpr *) andarg, param_ids))
return false;
}
return true;
@@ -773,6 +772,40 @@ testexpr_is_hashable(Node *testexpr)
return false;
}
+static bool
+test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids)
+{
+ /*
+ * The combining operator must be hashable and strict. The need for
+ * hashability is obvious, since we want to use hashing. Without
+ * strictness, behavior in the presence of nulls is too unpredictable. We
+ * actually must assume even more than plain strictness: it can't yield
+ * NULL for non-null inputs, either (see nodeSubplan.c). However, hash
+ * indexes and hash joins assume that too.
+ */
+ if (!hash_ok_operator(testexpr))
+ return false;
+
+ /*
+ * The left and right inputs must belong to the outer and inner queries
+ * respectively; hence Params that will be supplied by the subquery must
+ * not appear in the LHS, and Vars of the outer query must not appear in
+ * the RHS. (Ordinarily, this must be true because of the way that the
+ * parser builds an ANY SubLink's testexpr ... but inlining of functions
+ * could have changed the expression's structure, so we have to check.
+ * Such cases do not occur often enough to be worth trying to optimize, so
+ * we don't worry about trying to commute the clause or anything like
+ * that; we just need to be sure not to build an invalid plan.)
+ */
+ if (list_length(testexpr->args) != 2)
+ return false;
+ if (contain_exec_param((Node *) linitial(testexpr->args), param_ids))
+ return false;
+ if (contain_var_clause((Node *) lsecond(testexpr->args)))
+ return false;
+ return true;
+}
+
/*
* Check expression is hashable + strict
*
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 2ebd4ea332071..745f443e5c2df 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1200,13 +1200,9 @@ generate_setop_tlist(List *colTypes, List *colCollations,
* will reach the executor without any further processing.
*/
if (exprCollation(expr) != colColl)
- {
- expr = (Node *) makeRelabelType((Expr *) expr,
- exprType(expr),
- exprTypmod(expr),
- colColl,
- COERCE_IMPLICIT_CAST);
- }
+ expr = applyRelabelType(expr,
+ exprType(expr), exprTypmod(expr), colColl,
+ COERCE_IMPLICIT_CAST, -1, false);
tle = makeTargetEntry((Expr *) expr,
(AttrNumber) resno++,
diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c
index e04b144072369..750586fceb746 100644
--- a/src/backend/optimizer/util/clauses.c
+++ b/src/backend/optimizer/util/clauses.c
@@ -108,6 +108,7 @@ static bool contain_volatile_functions_not_nextval_walker(Node *node, void *cont
static bool max_parallel_hazard_walker(Node *node,
max_parallel_hazard_context *context);
static bool contain_nonstrict_functions_walker(Node *node, void *context);
+static bool contain_exec_param_walker(Node *node, List *param_ids);
static bool contain_context_dependent_node(Node *clause);
static bool contain_context_dependent_node_walker(Node *node, int *flags);
static bool contain_leaked_vars_walker(Node *node, void *context);
@@ -119,9 +120,6 @@ static Node *eval_const_expressions_mutator(Node *node,
static bool contain_non_const_walker(Node *node, void *context);
static bool ece_function_is_safe(Oid funcid,
eval_const_expressions_context *context);
-static Node *apply_const_relabel(Node *arg, Oid rtype,
- int32 rtypmod, Oid rcollid,
- CoercionForm rformat, int rlocation);
static List *simplify_or_arguments(List *args,
eval_const_expressions_context *context,
bool *haveNull, bool *forceTrue);
@@ -1221,6 +1219,40 @@ contain_nonstrict_functions_walker(Node *node, void *context)
context);
}
+/*****************************************************************************
+ * Check clauses for Params
+ *****************************************************************************/
+
+/*
+ * contain_exec_param
+ * Recursively search for PARAM_EXEC Params within a clause.
+ *
+ * Returns true if the clause contains any PARAM_EXEC Param with a paramid
+ * appearing in the given list of Param IDs. Does not descend into
+ * subqueries!
+ */
+bool
+contain_exec_param(Node *clause, List *param_ids)
+{
+ return contain_exec_param_walker(clause, param_ids);
+}
+
+static bool
+contain_exec_param_walker(Node *node, List *param_ids)
+{
+ if (node == NULL)
+ return false;
+ if (IsA(node, Param))
+ {
+ Param *p = (Param *) node;
+
+ if (p->paramkind == PARAM_EXEC &&
+ list_member_int(param_ids, p->paramid))
+ return true;
+ }
+ return expression_tree_walker(node, contain_exec_param_walker, param_ids);
+}
+
/*****************************************************************************
* Check clauses for context-dependent nodes
*****************************************************************************/
@@ -2784,12 +2816,13 @@ eval_const_expressions_mutator(Node *node,
arg = eval_const_expressions_mutator((Node *) relabel->arg,
context);
/* ... and attach a new RelabelType node, if needed */
- return apply_const_relabel(arg,
- relabel->resulttype,
- relabel->resulttypmod,
- relabel->resultcollid,
- relabel->relabelformat,
- relabel->location);
+ return applyRelabelType(arg,
+ relabel->resulttype,
+ relabel->resulttypmod,
+ relabel->resultcollid,
+ relabel->relabelformat,
+ relabel->location,
+ true);
}
case T_CoerceViaIO:
{
@@ -2936,12 +2969,13 @@ eval_const_expressions_mutator(Node *node,
arg = eval_const_expressions_mutator((Node *) collate->arg,
context);
/* ... and attach a new RelabelType node, if needed */
- return apply_const_relabel(arg,
- exprType(arg),
- exprTypmod(arg),
- collate->collOid,
- COERCE_IMPLICIT_CAST,
- collate->location);
+ return applyRelabelType(arg,
+ exprType(arg),
+ exprTypmod(arg),
+ collate->collOid,
+ COERCE_IMPLICIT_CAST,
+ collate->location,
+ true);
}
case T_CaseExpr:
{
@@ -3443,12 +3477,13 @@ eval_const_expressions_mutator(Node *node,
cdomain->resulttype);
/* Generate RelabelType to substitute for CoerceToDomain */
- return apply_const_relabel(arg,
- cdomain->resulttype,
- cdomain->resulttypmod,
- cdomain->resultcollid,
- cdomain->coercionformat,
- cdomain->location);
+ return applyRelabelType(arg,
+ cdomain->resulttype,
+ cdomain->resulttypmod,
+ cdomain->resultcollid,
+ cdomain->coercionformat,
+ cdomain->location,
+ true);
}
newcdomain = makeNode(CoerceToDomain);
@@ -3581,58 +3616,6 @@ ece_function_is_safe(Oid funcid, eval_const_expressions_context *context)
return false;
}
-/*
- * Subroutine for eval_const_expressions: apply RelabelType if needed
- */
-static Node *
-apply_const_relabel(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid,
- CoercionForm rformat, int rlocation)
-{
- /*
- * If we find stacked RelabelTypes (eg, from foo::int::oid) we can discard
- * all but the top one, and must do so to ensure that semantically
- * equivalent expressions are equal().
- */
- while (arg && IsA(arg, RelabelType))
- arg = (Node *) ((RelabelType *) arg)->arg;
-
- if (arg && IsA(arg, Const))
- {
- /*
- * If it's a Const, just modify it in-place; since this is part of
- * eval_const_expressions, we want to end up with a simple Const not
- * an expression tree. We assume the Const is newly generated and
- * hence safe to modify.
- */
- Const *con = (Const *) arg;
-
- con->consttype = rtype;
- con->consttypmod = rtypmod;
- con->constcollid = rcollid;
- return (Node *) con;
- }
- else if (exprType(arg) == rtype &&
- exprTypmod(arg) == rtypmod &&
- exprCollation(arg) == rcollid)
- {
- /* Sometimes we find a nest of relabels that net out to nothing. */
- return arg;
- }
- else
- {
- /* Nope, gotta have a RelabelType. */
- RelabelType *newrelabel = makeNode(RelabelType);
-
- newrelabel->arg = (Expr *) arg;
- newrelabel->resulttype = rtype;
- newrelabel->resulttypmod = rtypmod;
- newrelabel->resultcollid = rcollid;
- newrelabel->relabelformat = rformat;
- newrelabel->location = rlocation;
- return (Node *) newrelabel;
- }
-}
-
/*
* Subroutine for eval_const_expressions: process arguments of an OR clause
*
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 25545029d7ad1..f9d0d67aa75a6 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -974,11 +974,6 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
/* it has storage, ok to call the smgr */
curpages = RelationGetNumberOfBlocks(rel);
- /* coerce values in pg_class to more desirable types */
- relpages = (BlockNumber) rel->rd_rel->relpages;
- reltuples = (double) rel->rd_rel->reltuples;
- relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
-
/* report estimated # pages */
*pages = curpages;
/* quick exit if rel is clearly empty */
@@ -988,6 +983,7 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
*allvisfrac = 0;
break;
}
+
/* coerce values in pg_class to more desirable types */
relpages = (BlockNumber) rel->rd_rel->relpages;
reltuples = (double) rel->rd_rel->reltuples;
@@ -1006,12 +1002,12 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
}
/* estimate number of tuples from previous tuple density */
- if (relpages > 0)
+ if (reltuples >= 0 && relpages > 0)
density = reltuples / (double) relpages;
else
{
/*
- * When we have no data because the relation was truncated,
+ * If we have no data because the relation was never vacuumed,
* estimate tuple width from attribute datatypes. We assume
* here that the pages are completely full, which is OK for
* tables (since they've presumably not been VACUUMed yet) but
@@ -1059,6 +1055,7 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
break;
case RELKIND_FOREIGN_TABLE:
/* Just use whatever's in pg_class */
+ /* Note that FDW must cope if reltuples is -1! */
*pages = rel->rd_rel->relpages;
*tuples = rel->rd_rel->reltuples;
*allvisfrac = 0;
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index 25abc544fc721..6c49554defbcb 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -86,7 +86,6 @@ typedef struct
List *ckconstraints; /* CHECK constraints */
List *fkconstraints; /* FOREIGN KEY constraints */
List *ixconstraints; /* index-creating constraints */
- List *inh_indexes; /* cloned indexes from INCLUDING INDEXES */
List *extstats; /* cloned extended statistics */
List *blist; /* "before list" of things to do before
* creating the table */
@@ -154,6 +153,9 @@ static Const *transformPartitionBoundValue(ParseState *pstate, Node *con,
* Returns a List of utility commands to be done in sequence. One of these
* will be the transformed CreateStmt, but there may be additional actions
* to be done before and after the actual DefineRelation() call.
+ * In addition to normal utility commands such as AlterTableStmt and
+ * IndexStmt, the result list may contain TableLikeClause(s), representing
+ * the need to perform additional parse analysis after DefineRelation().
*
* SQL allows constraints to be scattered all over, so thumb through
* the columns and collect all constraints into one place.
@@ -241,7 +243,6 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
cxt.ckconstraints = NIL;
cxt.fkconstraints = NIL;
cxt.ixconstraints = NIL;
- cxt.inh_indexes = NIL;
cxt.extstats = NIL;
cxt.blist = NIL;
cxt.alist = NIL;
@@ -917,18 +918,18 @@ transformTableConstraint(CreateStmtContext *cxt, Constraint *constraint)
* transformTableLikeClause
*
* Change the LIKE portion of a CREATE TABLE statement into
- * column definitions which recreate the user defined column portions of
- * .
+ * column definitions that recreate the user defined column portions of
+ * . Also, if there are any LIKE options that we can't fully
+ * process at this point, add the TableLikeClause to cxt->alist, which
+ * will cause utility.c to call expandTableLikeClause() after the new
+ * table has been created.
*/
static void
transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_clause)
{
AttrNumber parent_attno;
- AttrNumber new_attno;
Relation relation;
TupleDesc tupleDesc;
- TupleConstr *constr;
- AttrMap *attmap;
AclResult aclresult;
char *comment;
ParseCallbackState pcbstate;
@@ -942,6 +943,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("LIKE is not supported for creating foreign tables")));
+ /* Open the relation referenced by the LIKE clause */
relation = relation_openrv(table_like_clause->relation, AccessShareLock);
if (relation->rd_rel->relkind != RELKIND_RELATION &&
@@ -978,37 +980,11 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
}
tupleDesc = RelationGetDescr(relation);
- constr = tupleDesc->constr;
-
- /*
- * Initialize column number map for map_variable_attnos(). We need this
- * since dropped columns in the source table aren't copied, so the new
- * table can have different column numbers.
- */
- attmap = make_attrmap(tupleDesc->natts);
-
- /*
- * We must fill the attmap now so that it can be used to process generated
- * column default expressions in the per-column loop below.
- */
- new_attno = 1;
- for (parent_attno = 1; parent_attno <= tupleDesc->natts;
- parent_attno++)
- {
- Form_pg_attribute attribute = TupleDescAttr(tupleDesc,
- parent_attno - 1);
-
- /*
- * Ignore dropped columns in the parent. attmap entry is left zero.
- */
- if (attribute->attisdropped)
- continue;
-
- attmap->attnums[parent_attno - 1] = list_length(cxt->columns) + (new_attno++);
- }
/*
* Insert the copied attributes into the cxt for the new table definition.
+ * We must do this now so that they appear in the table in the relative
+ * position where the LIKE clause is, as required by SQL99.
*/
for (parent_attno = 1; parent_attno <= tupleDesc->natts;
parent_attno++)
@@ -1052,52 +1028,12 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
cxt->columns = lappend(cxt->columns, def);
/*
- * Copy default, if present and it should be copied. We have separate
- * options for plain default expressions and GENERATED defaults.
+ * Although we don't transfer the column's default/generation
+ * expression now, we need to mark it GENERATED if appropriate.
*/
- if (attribute->atthasdef &&
- (attribute->attgenerated ?
- (table_like_clause->options & CREATE_TABLE_LIKE_GENERATED) :
- (table_like_clause->options & CREATE_TABLE_LIKE_DEFAULTS)))
- {
- Node *this_default = NULL;
- AttrDefault *attrdef;
- int i;
- bool found_whole_row;
-
- /* Find default in constraint structure */
- Assert(constr != NULL);
- attrdef = constr->defval;
- for (i = 0; i < constr->num_defval; i++)
- {
- if (attrdef[i].adnum == parent_attno)
- {
- this_default = stringToNode(attrdef[i].adbin);
- break;
- }
- }
- Assert(this_default != NULL);
-
- def->cooked_default = map_variable_attnos(this_default,
- 1, 0,
- attmap,
- InvalidOid, &found_whole_row);
-
- /*
- * Prevent this for the same reason as for constraints below. Note
- * that defaults cannot contain any vars, so it's OK that the
- * error message refers to generated columns.
- */
- if (found_whole_row)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("cannot convert whole-row table reference"),
- errdetail("Generation expression for column \"%s\" contains a whole-row reference to table \"%s\".",
- attributeName,
- RelationGetRelationName(relation))));
-
+ if (attribute->atthasdef && attribute->attgenerated &&
+ (table_like_clause->options & CREATE_TABLE_LIKE_GENERATED))
def->generated = attribute->attgenerated;
- }
/*
* Copy identity if requested
@@ -1145,14 +1081,191 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
}
}
+ /*
+ * We cannot yet deal with defaults, CHECK constraints, or indexes, since
+ * we don't yet know what column numbers the copied columns will have in
+ * the finished table. If any of those options are specified, add the
+ * LIKE clause to cxt->alist so that expandTableLikeClause will be called
+ * after we do know that.
+ */
+ if (table_like_clause->options &
+ (CREATE_TABLE_LIKE_DEFAULTS |
+ CREATE_TABLE_LIKE_GENERATED |
+ CREATE_TABLE_LIKE_CONSTRAINTS |
+ CREATE_TABLE_LIKE_INDEXES))
+ cxt->alist = lappend(cxt->alist, table_like_clause);
+
+ /*
+ * We may copy extended statistics if requested, since the representation
+ * of CreateStatsStmt doesn't depend on column numbers.
+ */
+ if (table_like_clause->options & CREATE_TABLE_LIKE_STATISTICS)
+ {
+ List *parent_extstats;
+ ListCell *l;
+
+ parent_extstats = RelationGetStatExtList(relation);
+
+ foreach(l, parent_extstats)
+ {
+ Oid parent_stat_oid = lfirst_oid(l);
+ CreateStatsStmt *stats_stmt;
+
+ stats_stmt = generateClonedExtStatsStmt(cxt->relation,
+ RelationGetRelid(relation),
+ parent_stat_oid);
+
+ /* Copy comment on statistics object, if requested */
+ if (table_like_clause->options & CREATE_TABLE_LIKE_COMMENTS)
+ {
+ comment = GetComment(parent_stat_oid, StatisticExtRelationId, 0);
+
+ /*
+ * We make use of CreateStatsStmt's stxcomment option, so as
+ * not to need to know now what name the statistics will have.
+ */
+ stats_stmt->stxcomment = comment;
+ }
+
+ cxt->extstats = lappend(cxt->extstats, stats_stmt);
+ }
+
+ list_free(parent_extstats);
+ }
+
+ /*
+ * Close the parent rel, but keep our AccessShareLock on it until xact
+ * commit. That will prevent someone else from deleting or ALTERing the
+ * parent before we can run expandTableLikeClause.
+ */
+ table_close(relation, NoLock);
+}
+
+/*
+ * expandTableLikeClause
+ *
+ * Process LIKE options that require knowing the final column numbers
+ * assigned to the new table's columns. This executes after we have
+ * run DefineRelation for the new table. It returns a list of utility
+ * commands that should be run to generate indexes etc.
+ */
+List *
+expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause)
+{
+ List *result = NIL;
+ List *atsubcmds = NIL;
+ AttrNumber parent_attno;
+ Relation relation;
+ Relation childrel;
+ TupleDesc tupleDesc;
+ TupleConstr *constr;
+ AttrMap *attmap;
+ char *comment;
+
+ /*
+ * Open the relation referenced by the LIKE clause. We should still have
+ * the table lock obtained by transformTableLikeClause (and this'll throw
+ * an assertion failure if not). Hence, no need to recheck privileges
+ * etc.
+ */
+ relation = relation_openrv(table_like_clause->relation, NoLock);
+
+ tupleDesc = RelationGetDescr(relation);
+ constr = tupleDesc->constr;
+
+ /*
+ * Open the newly-created child relation; we have lock on that too.
+ */
+ childrel = relation_openrv(heapRel, NoLock);
+
+ /*
+ * Construct a map from the LIKE relation's attnos to the child rel's.
+ * This re-checks type match etc, although it shouldn't be possible to
+ * have a failure since both tables are locked.
+ */
+ attmap = build_attrmap_by_name(RelationGetDescr(childrel),
+ tupleDesc);
+
+ /*
+ * Process defaults, if required.
+ */
+ if ((table_like_clause->options &
+ (CREATE_TABLE_LIKE_DEFAULTS | CREATE_TABLE_LIKE_GENERATED)) &&
+ constr != NULL)
+ {
+ AttrDefault *attrdef = constr->defval;
+
+ for (parent_attno = 1; parent_attno <= tupleDesc->natts;
+ parent_attno++)
+ {
+ Form_pg_attribute attribute = TupleDescAttr(tupleDesc,
+ parent_attno - 1);
+
+ /*
+ * Ignore dropped columns in the parent.
+ */
+ if (attribute->attisdropped)
+ continue;
+
+ /*
+ * Copy default, if present and it should be copied. We have
+ * separate options for plain default expressions and GENERATED
+ * defaults.
+ */
+ if (attribute->atthasdef &&
+ (attribute->attgenerated ?
+ (table_like_clause->options & CREATE_TABLE_LIKE_GENERATED) :
+ (table_like_clause->options & CREATE_TABLE_LIKE_DEFAULTS)))
+ {
+ Node *this_default = NULL;
+ AlterTableCmd *atsubcmd;
+ bool found_whole_row;
+
+ /* Find default in constraint structure */
+ for (int i = 0; i < constr->num_defval; i++)
+ {
+ if (attrdef[i].adnum == parent_attno)
+ {
+ this_default = stringToNode(attrdef[i].adbin);
+ break;
+ }
+ }
+ Assert(this_default != NULL);
+
+ atsubcmd = makeNode(AlterTableCmd);
+ atsubcmd->subtype = AT_CookedColumnDefault;
+ atsubcmd->num = attmap->attnums[parent_attno - 1];
+ atsubcmd->def = map_variable_attnos(this_default,
+ 1, 0,
+ attmap,
+ InvalidOid,
+ &found_whole_row);
+
+ /*
+ * Prevent this for the same reason as for constraints below.
+ * Note that defaults cannot contain any vars, so it's OK that
+ * the error message refers to generated columns.
+ */
+ if (found_whole_row)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot convert whole-row table reference"),
+ errdetail("Generation expression for column \"%s\" contains a whole-row reference to table \"%s\".",
+ NameStr(attribute->attname),
+ RelationGetRelationName(relation))));
+
+ atsubcmds = lappend(atsubcmds, atsubcmd);
+ }
+ }
+ }
+
/*
* Copy CHECK constraints if requested, being careful to adjust attribute
* numbers so they match the child.
*/
if ((table_like_clause->options & CREATE_TABLE_LIKE_CONSTRAINTS) &&
- tupleDesc->constr)
+ constr != NULL)
{
- TupleConstr *constr = tupleDesc->constr;
int ccnum;
for (ccnum = 0; ccnum < constr->num_check; ccnum++)
@@ -1160,9 +1273,10 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
char *ccname = constr->check[ccnum].ccname;
char *ccbin = constr->check[ccnum].ccbin;
bool ccnoinherit = constr->check[ccnum].ccnoinherit;
- Constraint *n = makeNode(Constraint);
Node *ccbin_node;
bool found_whole_row;
+ Constraint *n;
+ AlterTableCmd *atsubcmd;
ccbin_node = map_variable_attnos(stringToNode(ccbin),
1, 0,
@@ -1183,13 +1297,22 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
ccname,
RelationGetRelationName(relation))));
+ n = makeNode(Constraint);
n->contype = CONSTR_CHECK;
n->conname = pstrdup(ccname);
n->location = -1;
n->is_no_inherit = ccnoinherit;
n->raw_expr = NULL;
n->cooked_expr = nodeToString(ccbin_node);
- cxt->ckconstraints = lappend(cxt->ckconstraints, n);
+
+ /* We can skip validation, since the new table should be empty. */
+ n->skip_validation = true;
+ n->initially_valid = true;
+
+ atsubcmd = makeNode(AlterTableCmd);
+ atsubcmd->subtype = AT_AddConstraint;
+ atsubcmd->def = (Node *) n;
+ atsubcmds = lappend(atsubcmds, atsubcmd);
/* Copy comment on constraint */
if ((table_like_clause->options & CREATE_TABLE_LIKE_COMMENTS) &&
@@ -1201,18 +1324,34 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
CommentStmt *stmt = makeNode(CommentStmt);
stmt->objtype = OBJECT_TABCONSTRAINT;
- stmt->object = (Node *) list_make3(makeString(cxt->relation->schemaname),
- makeString(cxt->relation->relname),
+ stmt->object = (Node *) list_make3(makeString(heapRel->schemaname),
+ makeString(heapRel->relname),
makeString(n->conname));
stmt->comment = comment;
- cxt->alist = lappend(cxt->alist, stmt);
+ result = lappend(result, stmt);
}
}
}
/*
- * Likewise, copy indexes if requested
+ * If we generated any ALTER TABLE actions above, wrap them into a single
+ * ALTER TABLE command. Stick it at the front of the result, so it runs
+ * before any CommentStmts we made above.
+ */
+ if (atsubcmds)
+ {
+ AlterTableStmt *atcmd = makeNode(AlterTableStmt);
+
+ atcmd->relation = copyObject(heapRel);
+ atcmd->cmds = atsubcmds;
+ atcmd->objtype = OBJECT_TABLE;
+ atcmd->missing_ok = false;
+ result = lcons(atcmd, result);
+ }
+
+ /*
+ * Process indexes if required.
*/
if ((table_like_clause->options & CREATE_TABLE_LIKE_INDEXES) &&
relation->rd_rel->relhasindex)
@@ -1231,7 +1370,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
parent_index = index_open(parent_index_oid, AccessShareLock);
/* Build CREATE INDEX statement to recreate the parent_index */
- index_stmt = generateClonedIndexStmt(cxt->relation,
+ index_stmt = generateClonedIndexStmt(heapRel,
parent_index,
attmap,
NULL);
@@ -1248,49 +1387,14 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
index_stmt->idxcomment = comment;
}
- /* Save it in the inh_indexes list for the time being */
- cxt->inh_indexes = lappend(cxt->inh_indexes, index_stmt);
+ result = lappend(result, index_stmt);
index_close(parent_index, AccessShareLock);
}
}
- /*
- * Likewise, copy extended statistics if requested
- */
- if (table_like_clause->options & CREATE_TABLE_LIKE_STATISTICS)
- {
- List *parent_extstats;
- ListCell *l;
-
- parent_extstats = RelationGetStatExtList(relation);
-
- foreach(l, parent_extstats)
- {
- Oid parent_stat_oid = lfirst_oid(l);
- CreateStatsStmt *stats_stmt;
-
- stats_stmt = generateClonedExtStatsStmt(cxt->relation,
- RelationGetRelid(relation),
- parent_stat_oid);
-
- /* Copy comment on statistics object, if requested */
- if (table_like_clause->options & CREATE_TABLE_LIKE_COMMENTS)
- {
- comment = GetComment(parent_stat_oid, StatisticExtRelationId, 0);
-
- /*
- * We make use of CreateStatsStmt's stxcomment option, so as
- * not to need to know now what name the statistics will have.
- */
- stats_stmt->stxcomment = comment;
- }
-
- cxt->extstats = lappend(cxt->extstats, stats_stmt);
- }
-
- list_free(parent_extstats);
- }
+ /* Done with child rel */
+ table_close(childrel, NoLock);
/*
* Close the parent rel, but keep our AccessShareLock on it until xact
@@ -1298,6 +1402,8 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
* parent before the child is committed.
*/
table_close(relation, NoLock);
+
+ return result;
}
static void
@@ -1590,7 +1696,7 @@ generateClonedIndexStmt(RangeVar *heapRel, Relation source_idx,
attmap,
InvalidOid, &found_whole_row);
- /* As in transformTableLikeClause, reject whole-row variables */
+ /* As in expandTableLikeClause, reject whole-row variables */
if (found_whole_row)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
@@ -1699,7 +1805,7 @@ generateClonedIndexStmt(RangeVar *heapRel, Relation source_idx,
attmap,
InvalidOid, &found_whole_row);
- /* As in transformTableLikeClause, reject whole-row variables */
+ /* As in expandTableLikeClause, reject whole-row variables */
if (found_whole_row)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
@@ -1897,24 +2003,6 @@ transformIndexConstraints(CreateStmtContext *cxt)
indexlist = lappend(indexlist, index);
}
- /* Add in any indexes defined by LIKE ... INCLUDING INDEXES */
- foreach(lc, cxt->inh_indexes)
- {
- index = (IndexStmt *) lfirst(lc);
-
- if (index->primary)
- {
- if (cxt->pkey != NULL)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
- errmsg("multiple primary keys for table \"%s\" are not allowed",
- cxt->relation->relname)));
- cxt->pkey = index;
- }
-
- indexlist = lappend(indexlist, index);
- }
-
/*
* Scan the index list and remove any redundant index specifications. This
* can happen if, for instance, the user writes UNIQUE PRIMARY KEY. A
@@ -3115,7 +3203,6 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt,
cxt.ckconstraints = NIL;
cxt.fkconstraints = NIL;
cxt.ixconstraints = NIL;
- cxt.inh_indexes = NIL;
cxt.extstats = NIL;
cxt.blist = NIL;
cxt.alist = NIL;
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index c6ec657a9367c..1b8cd7bacd43c 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -3080,6 +3080,10 @@ relation_needs_vacanalyze(Oid relid,
instuples = tabentry->inserts_since_vacuum;
anltuples = tabentry->changes_since_analyze;
+ /* If the table hasn't yet been vacuumed, take reltuples as zero */
+ if (reltuples < 0)
+ reltuples = 0;
+
vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
vacinsthresh = (float4) vac_ins_base_thresh + vac_ins_scale_factor * reltuples;
anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 73ce944fb1ce9..e6643ad66ca48 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3779,8 +3779,20 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_HASH_BATCH_ELECT:
event_name = "HashBatchElect";
break;
- case WAIT_EVENT_HASH_BATCH_LOAD:
- event_name = "HashBatchLoad";
+ case WAIT_EVENT_HASH_STRIPE_ELECT:
+ event_name = "HashStripeElect";
+ break;
+ case WAIT_EVENT_HASH_STRIPE_RESET:
+ event_name = "HashStripeReset";
+ break;
+ case WAIT_EVENT_HASH_STRIPE_LOAD:
+ event_name = "HashStripeLoad";
+ break;
+ case WAIT_EVENT_HASH_STRIPE_OVERFLOW:
+ event_name = "HashStripeOverflow";
+ break;
+ case WAIT_EVENT_HASH_STRIPE_PROBE:
+ event_name = "HashStripeProbe";
break;
case WAIT_EVENT_HASH_BUILD_ALLOCATE:
event_name = "HashBuildAllocate";
@@ -3794,6 +3806,21 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_HASH_BUILD_HASH_OUTER:
event_name = "HashBuildHashOuter";
break;
+ case WAIT_EVENT_HASH_EVICT_ELECT:
+ event_name = "HashEvictElect";
+ break;
+ case WAIT_EVENT_HASH_EVICT_RESET:
+ event_name = "HashEvictReset";
+ break;
+ case WAIT_EVENT_HASH_EVICT_SPILL:
+ event_name = "HashEvictSpill";
+ break;
+ case WAIT_EVENT_HASH_EVICT_FINISH:
+ event_name = "HashEvictFinish";
+ break;
+ case WAIT_EVENT_HASH_REPARTITION_BATCH0_DRAIN_QUEUE:
+ event_name = "HashRepartitionBatch0DrainQueue";
+ break;
case WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE:
event_name = "HashGrowBatchesAllocate";
break;
@@ -3940,6 +3967,9 @@ pgstat_get_wait_io(WaitEventIO w)
case WAIT_EVENT_BUFFILE_WRITE:
event_name = "BufFileWrite";
break;
+ case WAIT_EVENT_BUFFILE_TRUNCATE:
+ event_name = "BufFileTruncate";
+ break;
case WAIT_EVENT_CONTROL_FILE_READ:
event_name = "ControlFileRead";
break;
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index e9701ea722154..9d5d68f3fa785 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -524,6 +524,7 @@ SnapBuildBuildSnapshot(SnapBuild *builder)
snapshot->curcid = FirstCommandId;
snapshot->active_count = 0;
snapshot->regd_count = 0;
+ snapshot->snapXactCompletionCount = 0;
return snapshot;
}
diff --git a/src/backend/rewrite/rewriteDefine.c b/src/backend/rewrite/rewriteDefine.c
index 9989df1107468..8ef0917021cf9 100644
--- a/src/backend/rewrite/rewriteDefine.c
+++ b/src/backend/rewrite/rewriteDefine.c
@@ -621,7 +621,7 @@ DefineQueryRewrite(const char *rulename,
classForm->relam = InvalidOid;
classForm->reltablespace = InvalidOid;
classForm->relpages = 0;
- classForm->reltuples = 0;
+ classForm->reltuples = -1;
classForm->relallvisible = 0;
classForm->reltoastrelid = InvalidOid;
classForm->relhasindex = false;
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index 3e37e2758ca0c..4e30abb674378 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -1246,7 +1246,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
* of clauses. We must return 1.0 so the calling function's selectivity is
* unaffected.
*/
- if (bms_num_members(clauses_attnums) < 2)
+ if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
{
bms_free(clauses_attnums);
pfree(list_attnums);
@@ -1273,18 +1273,18 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
{
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
Bitmapset *matched;
- int num_matched;
+ BMS_Membership membership;
/* skip statistics that are not of the correct type */
if (stat->kind != STATS_EXT_DEPENDENCIES)
continue;
matched = bms_intersect(clauses_attnums, stat->keys);
- num_matched = bms_num_members(matched);
+ membership = bms_membership(matched);
bms_free(matched);
/* skip objects matching fewer than two attributes from clauses */
- if (num_matched < 2)
+ if (membership != BMS_MULTIPLE)
continue;
func_dependencies[nfunc_dependencies]
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f1ae6f9f84430..a2a963bd5b41f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3578,7 +3578,7 @@ IncrBufferRefCount(Buffer buffer)
* This is essentially the same as MarkBufferDirty, except:
*
* 1. The caller does not write WAL; so if checksums are enabled, we may need
- * to write an XLOG_FPI WAL record to protect against torn pages.
+ * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
* 2. The caller might have only share-lock instead of exclusive-lock on the
* buffer's content lock.
* 3. This function does not guarantee that the buffer is always marked dirty
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 2d7a08232089d..2e1ced49db002 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -32,10 +32,14 @@
* (by opening multiple fd.c temporary files). This is an essential feature
* for sorts and hashjoins on large amounts of data.
*
- * BufFile supports temporary files that can be made read-only and shared with
- * other backends, as infrastructure for parallel execution. Such files need
- * to be created as a member of a SharedFileSet that all participants are
- * attached to.
+ * BufFile supports temporary files that can be shared with other backends, as
+ * infrastructure for parallel execution. Such files need to be created as a
+ * member of a SharedFileSet that all participants are attached to.
+ *
+ * BufFile also supports temporary files that can be used by the single backend
+ * when the corresponding files need to be survived across the transaction and
+ * need to be opened and closed multiple times. Such files need to be created
+ * as a member of a SharedFileSet.
*-------------------------------------------------------------------------
*/
@@ -277,7 +281,7 @@ BufFileCreateShared(SharedFileSet *fileset, const char *name)
* backends and render it read-only.
*/
BufFile *
-BufFileOpenShared(SharedFileSet *fileset, const char *name)
+BufFileOpenShared(SharedFileSet *fileset, const char *name, int mode)
{
BufFile *file;
char segment_name[MAXPGPATH];
@@ -301,7 +305,7 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name)
}
/* Try to load a segment. */
SharedSegmentName(segment_name, name, nfiles);
- files[nfiles] = SharedFileSetOpen(fileset, segment_name);
+ files[nfiles] = SharedFileSetOpen(fileset, segment_name, mode);
if (files[nfiles] <= 0)
break;
++nfiles;
@@ -316,12 +320,12 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name)
if (nfiles == 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
- segment_name, name)));
+ errmsg("%d: could not open temporary file \"%s\" from BufFile \"%s\": %m",
+ MyProcPid, segment_name, name)));
file = makeBufFileCommon(nfiles);
file->files = files;
- file->readOnly = true; /* Can't write to files opened this way */
+ file->readOnly = (mode == O_RDONLY) ? true : false;
file->fileset = fileset;
file->name = pstrdup(name);
@@ -666,11 +670,21 @@ BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
newFile = file->curFile;
newOffset = (file->curOffset + file->pos) + offset;
break;
-#ifdef NOT_USED
case SEEK_END:
- /* could be implemented, not needed currently */
+
+ /*
+ * The file size of the last file gives us the end offset of that
+ * file.
+ */
+ newFile = file->numFiles - 1;
+ newOffset = FileSize(file->files[file->numFiles - 1]);
+ if (newOffset < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
+ FilePathName(file->files[file->numFiles - 1]),
+ file->name)));
break;
-#endif
default:
elog(ERROR, "invalid whence: %d", whence);
return EOF;
@@ -838,3 +852,98 @@ BufFileAppend(BufFile *target, BufFile *source)
return startBlock;
}
+
+/*
+ * Truncate a BufFile created by BufFileCreateShared up to the given fileno and
+ * the offset.
+ */
+void
+BufFileTruncateShared(BufFile *file, int fileno, off_t offset)
+{
+ int numFiles = file->numFiles;
+ int newFile = fileno;
+ off_t newOffset = file->curOffset;
+ char segment_name[MAXPGPATH];
+ int i;
+
+ /*
+ * Loop over all the files up to the given fileno and remove the files
+ * that are greater than the fileno and truncate the given file up to the
+ * offset. Note that we also remove the given fileno if the offset is 0
+ * provided it is not the first file in which we truncate it.
+ */
+ for (i = file->numFiles - 1; i >= fileno; i--)
+ {
+ if ((i != fileno || offset == 0) && i != 0)
+ {
+ SharedSegmentName(segment_name, file->name, i);
+ FileClose(file->files[i]);
+ if (!SharedFileSetDelete(file->fileset, segment_name, true))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not delete shared fileset \"%s\": %m",
+ segment_name)));
+ numFiles--;
+ newOffset = MAX_PHYSICAL_FILESIZE;
+
+ /*
+ * This is required to indicate that we have deleted the given
+ * fileno.
+ */
+ if (i == fileno)
+ newFile--;
+ }
+ else
+ {
+ if (FileTruncate(file->files[i], offset,
+ WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m",
+ FilePathName(file->files[i]))));
+ newOffset = offset;
+ }
+ }
+
+ file->numFiles = numFiles;
+
+ /*
+ * If the truncate point is within existing buffer then we can just adjust
+ * pos within buffer.
+ */
+ if (newFile == file->curFile &&
+ newOffset >= file->curOffset &&
+ newOffset <= file->curOffset + file->nbytes)
+ {
+ /* No need to reset the current pos if the new pos is greater. */
+ if (newOffset <= file->curOffset + file->pos)
+ file->pos = (int) (newOffset - file->curOffset);
+
+ /* Adjust the nbytes for the current buffer. */
+ file->nbytes = (int) (newOffset - file->curOffset);
+ }
+ else if (newFile == file->curFile &&
+ newOffset < file->curOffset)
+ {
+ /*
+ * The truncate point is within the existing file but prior to the
+ * current position, so we can forget the current buffer and reset the
+ * current position.
+ */
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+ else if (newFile < file->curFile)
+ {
+ /*
+ * The truncate point is prior to the current file, so need to reset
+ * the current position accordingly.
+ */
+ file->curFile = newFile;
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+ /* Nothing to do, if the truncate point is beyond current file. */
+}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 5f6420efb2d76..f05abbec56f99 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1743,18 +1743,17 @@ PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
/*
* Open a file that was created with PathNameCreateTemporaryFile, possibly in
* another backend. Files opened this way don't count against the
- * temp_file_limit of the caller, are read-only and are automatically closed
- * at the end of the transaction but are not deleted on close.
+ * temp_file_limit of the caller, are automatically closed at the end of the
+ * transaction but are not deleted on close.
*/
File
-PathNameOpenTemporaryFile(const char *path)
+PathNameOpenTemporaryFile(const char *path, int mode)
{
File file;
ResourceOwnerEnlargeFiles(CurrentResourceOwner);
- /* We open the file read-only. */
- file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
+ file = PathNameOpenFile(path, mode | PG_BINARY);
/* If no such file, then we don't raise an error. */
if (file <= 0 && errno != ENOENT)
@@ -1772,6 +1771,7 @@ PathNameOpenTemporaryFile(const char *path)
return file;
}
+
/*
* Delete a file by pathname. Return true if the file existed, false if
* didn't.
diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c
index 16b7594756c66..8b96e81fffff9 100644
--- a/src/backend/storage/file/sharedfileset.c
+++ b/src/backend/storage/file/sharedfileset.c
@@ -13,6 +13,10 @@
* files can be discovered by name, and a shared ownership semantics so that
* shared files survive until the last user detaches.
*
+ * SharedFileSets can be used by backends when the temporary files need to be
+ * opened/closed multiple times and the underlying files need to survive across
+ * transactions.
+ *
*-------------------------------------------------------------------------
*/
@@ -25,25 +29,36 @@
#include "common/hashfn.h"
#include "miscadmin.h"
#include "storage/dsm.h"
+#include "storage/ipc.h"
#include "storage/sharedfileset.h"
#include "utils/builtins.h"
+static List *filesetlist = NIL;
+
static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum);
+static void SharedFileSetDeleteOnProcExit(int status, Datum arg);
static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace);
static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name);
static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name);
/*
- * Initialize a space for temporary files that can be opened for read-only
- * access by other backends. Other backends must attach to it before
- * accessing it. Associate this SharedFileSet with 'seg'. Any contained
- * files will be deleted when the last backend detaches.
+ * Initialize a space for temporary files that can be opened by other backends.
+ * Other backends must attach to it before accessing it. Associate this
+ * SharedFileSet with 'seg'. Any contained files will be deleted when the
+ * last backend detaches.
+ *
+ * We can also use this interface if the temporary files are used only by
+ * single backend but the files need to be opened and closed multiple times
+ * and also the underlying files need to survive across transactions. For
+ * such cases, dsm segment 'seg' should be passed as NULL. Callers are
+ * expected to explicitly remove such files by using SharedFileSetDelete/
+ * SharedFileSetDeleteAll or we remove such files on proc exit.
*
* Files will be distributed over the tablespaces configured in
* temp_tablespaces.
*
* Under the covers the set is one or more directories which will eventually
- * be deleted when there are no backends attached.
+ * be deleted.
*/
void
SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
@@ -84,7 +99,25 @@ SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
}
/* Register our cleanup callback. */
- on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+ if (seg)
+ on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+ else
+ {
+ static bool registered_cleanup = false;
+
+ if (!registered_cleanup)
+ {
+ /*
+ * We must not have registered any fileset before registering the
+ * fileset clean up.
+ */
+ Assert(filesetlist == NIL);
+ on_proc_exit(SharedFileSetDeleteOnProcExit, 0);
+ registered_cleanup = true;
+ }
+
+ filesetlist = lcons((void *) fileset, filesetlist);
+ }
}
/*
@@ -147,13 +180,13 @@ SharedFileSetCreate(SharedFileSet *fileset, const char *name)
* another backend.
*/
File
-SharedFileSetOpen(SharedFileSet *fileset, const char *name)
+SharedFileSetOpen(SharedFileSet *fileset, const char *name, int mode)
{
char path[MAXPGPATH];
File file;
SharedFilePath(path, fileset, name);
- file = PathNameOpenTemporaryFile(path);
+ file = PathNameOpenTemporaryFile(path, mode);
return file;
}
@@ -192,6 +225,9 @@ SharedFileSetDeleteAll(SharedFileSet *fileset)
SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]);
PathNameDeleteTemporaryDir(dirpath);
}
+
+ /* Unregister the shared fileset */
+ SharedFileSetUnregister(fileset);
}
/*
@@ -222,6 +258,58 @@ SharedFileSetOnDetach(dsm_segment *segment, Datum datum)
SharedFileSetDeleteAll(fileset);
}
+/*
+ * Callback function that will be invoked on the process exit. This will
+ * process the list of all the registered sharedfilesets and delete the
+ * underlying files.
+ */
+static void
+SharedFileSetDeleteOnProcExit(int status, Datum arg)
+{
+ ListCell *l;
+
+ /* Loop over all the pending shared fileset entry */
+ foreach(l, filesetlist)
+ {
+ SharedFileSet *fileset = (SharedFileSet *) lfirst(l);
+
+ SharedFileSetDeleteAll(fileset);
+ }
+
+ filesetlist = NIL;
+}
+
+/*
+ * Unregister the shared fileset entry registered for cleanup on proc exit.
+ */
+void
+SharedFileSetUnregister(SharedFileSet *input_fileset)
+{
+ ListCell *l;
+
+ /*
+ * If the caller is following the dsm based cleanup then we don't maintain
+ * the filesetlist so return.
+ */
+ if (filesetlist == NIL)
+ return;
+
+ foreach(l, filesetlist)
+ {
+ SharedFileSet *fileset = (SharedFileSet *) lfirst(l);
+
+ /* Remove the entry from the list */
+ if (input_fileset == fileset)
+ {
+ filesetlist = list_delete_cell(filesetlist, l);
+ return;
+ }
+ }
+
+ /* Should have found a match */
+ Assert(false);
+}
+
/*
* Build the path for the directory holding the files backing a SharedFileSet
* in a given tablespace.
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 8262abd42e6bd..a023090fbbd3d 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -146,7 +146,7 @@ typedef struct ProcArrayStruct
* I.e. the difference to GlobalVisSharedRels is that
* snapshot in other databases are ignored.
*
- * 3) GlobalVisCatalogRels, which only considers an XID's
+ * 3) GlobalVisDataRels, which only considers an XID's
* effects visible-to-everyone if neither snapshots in the current
* database, nor a replication slot's xmin consider XID as running.
*
@@ -198,7 +198,7 @@ typedef struct ComputeXidHorizonsResult
* be removed.
*
* This likely should only be needed to determine whether pg_subtrans can
- * be truncated. It currently includes the effects of replications slots,
+ * be truncated. It currently includes the effects of replication slots,
* for historical reasons. But that could likely be changed.
*/
TransactionId oldest_considered_running;
@@ -207,7 +207,7 @@ typedef struct ComputeXidHorizonsResult
* Oldest xid for which deleted tuples need to be retained in shared
* tables.
*
- * This includes the effects of replications lots. If that's not desired,
+ * This includes the effects of replication slots. If that's not desired,
* look at shared_oldest_nonremovable_raw;
*/
TransactionId shared_oldest_nonremovable;
@@ -407,6 +407,7 @@ CreateSharedProcArray(void)
procArray->lastOverflowedXid = InvalidTransactionId;
procArray->replication_slot_xmin = InvalidTransactionId;
procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+ ShmemVariableCache->xactCompletionCount = 1;
}
allProcs = ProcGlobal->allProcs;
@@ -534,6 +535,9 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
/* Advance global latestCompletedXid while holding the lock */
MaintainLatestCompletedXid(latestXid);
+ /* Same with xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
ProcGlobal->xids[proc->pgxactoff] = 0;
ProcGlobal->subxidStates[proc->pgxactoff].overflowed = false;
ProcGlobal->subxidStates[proc->pgxactoff].count = 0;
@@ -667,6 +671,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
{
size_t pgxactoff = proc->pgxactoff;
+ Assert(LWLockHeldByMe(ProcArrayLock));
Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff]));
Assert(ProcGlobal->xids[pgxactoff] == proc->xid);
@@ -698,6 +703,9 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
/* Also advance global latestCompletedXid while holding the lock */
MaintainLatestCompletedXid(latestXid);
+
+ /* Same with xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
}
/*
@@ -832,13 +840,20 @@ ProcArrayClearTransaction(PGPROC *proc)
size_t pgxactoff;
/*
- * We can skip locking ProcArrayLock exclusively here, because this action
- * does not actually change anyone's view of the set of running XIDs: our
- * entry is duplicate with the gxact that has already been inserted into
- * the ProcArray. But need it in shared mode for pgproc->pgxactoff to stay
- * the same.
+ * Currently we need to lock ProcArrayLock exclusively here, as we
+ * increment xactCompletionCount below. We also need it at least in shared
+ * mode for pgproc->pgxactoff to stay the same below.
+ *
+ * We could however, as this action does not actually change anyone's view
+ * of the set of running XIDs (our entry is duplicate with the gxact that
+ * has already been inserted into the ProcArray), lower the lock level to
+ * shared if we were to make xactCompletionCount an atomic variable. But
+ * that doesn't seem worth it currently, as a 2PC commit is heavyweight
+ * enough for this not to be the bottleneck. If it ever becomes a
+ * bottleneck it may also be worth considering to combine this with the
+ * subsequent ProcArrayRemove()
*/
- LWLockAcquire(ProcArrayLock, LW_SHARED);
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
pgxactoff = proc->pgxactoff;
@@ -852,6 +867,15 @@ ProcArrayClearTransaction(PGPROC *proc)
Assert(!(proc->vacuumFlags & PROC_VACUUM_STATE_MASK));
Assert(!proc->delayChkpt);
+ /*
+ * Need to increment completion count even though transaction hasn't
+ * really committed yet. The reason for that is that GetSnapshotData()
+ * omits the xid of the current transaction, thus without the increment we
+ * otherwise could end up reusing the snapshot later. Which would be bad,
+ * because it might not count the prepared transaction as running.
+ */
+ ShmemVariableCache->xactCompletionCount++;
+
/* Clear the subtransaction-XID cache too */
Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count &&
ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed);
@@ -1663,7 +1687,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
TransactionId xmin;
/* Fetch xid just once - see GetNewTransactionId */
- xid = UINT32_ACCESS_ONCE(other_xids[pgprocno]);
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
xmin = UINT32_ACCESS_ONCE(proc->xmin);
/*
@@ -1916,6 +1940,93 @@ GetMaxSnapshotSubxidCount(void)
return TOTAL_MAX_CACHED_SUBXIDS;
}
+/*
+ * Initialize old_snapshot_threshold specific parts of a newly build snapshot.
+ */
+static void
+GetSnapshotDataInitOldSnapshot(Snapshot snapshot)
+{
+ if (!OldSnapshotThresholdActive())
+ {
+ /*
+ * If not using "snapshot too old" feature, fill related fields with
+ * dummy values that don't require any locking.
+ */
+ snapshot->lsn = InvalidXLogRecPtr;
+ snapshot->whenTaken = 0;
+ }
+ else
+ {
+ /*
+ * Capture the current time and WAL stream location in case this
+ * snapshot becomes old enough to need to fall back on the special
+ * "old snapshot" logic.
+ */
+ snapshot->lsn = GetXLogInsertRecPtr();
+ snapshot->whenTaken = GetSnapshotCurrentTimestamp();
+ MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin);
+ }
+}
+
+/*
+ * Helper function for GetSnapshotData() that checks if the bulk of the
+ * visibility information in the snapshot is still valid. If so, it updates
+ * the fields that need to change and returns true. Otherwise it returns
+ * false.
+ *
+ * This very likely can be evolved to not need ProcArrayLock held (at very
+ * least in the case we already hold a snapshot), but that's for another day.
+ */
+static bool
+GetSnapshotDataReuse(Snapshot snapshot)
+{
+ uint64 curXactCompletionCount;
+
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ if (unlikely(snapshot->snapXactCompletionCount == 0))
+ return false;
+
+ curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+ if (curXactCompletionCount != snapshot->snapXactCompletionCount)
+ return false;
+
+ /*
+ * If the current xactCompletionCount is still the same as it was at the
+ * time the snapshot was built, we can be sure that rebuilding the
+ * contents of the snapshot the hard way would result in the same snapshot
+ * contents:
+ *
+ * As explained in transam/README, the set of xids considered running by
+ * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot
+ * contents only depend on transactions with xids and xactCompletionCount
+ * is incremented whenever a transaction with an xid finishes (while
+ * holding ProcArrayLock) exclusively). Thus the xactCompletionCount check
+ * ensures we would detect if the snapshot would have changed.
+ *
+ * As the snapshot contents are the same as it was before, it is is safe
+ * to re-enter the snapshot's xmin into the PGPROC array. None of the rows
+ * visible under the snapshot could already have been removed (that'd
+ * require the set of running transactions to change) and it fulfills the
+ * requirement that concurrent GetSnapshotData() calls yield the same
+ * xmin.
+ */
+ if (!TransactionIdIsValid(MyProc->xmin))
+ MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+ RecentXmin = snapshot->xmin;
+ Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+ snapshot->curcid = GetCurrentCommandId(false);
+ snapshot->active_count = 0;
+ snapshot->regd_count = 0;
+ snapshot->copied = false;
+
+ GetSnapshotDataInitOldSnapshot(snapshot);
+
+ return true;
+}
+
/*
* GetSnapshotData -- returns information about running transactions.
*
@@ -1963,6 +2074,7 @@ GetSnapshotData(Snapshot snapshot)
TransactionId oldestxid;
int mypgxactoff;
TransactionId myxid;
+ uint64 curXactCompletionCount;
TransactionId replication_slot_xmin = InvalidTransactionId;
TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
@@ -2007,12 +2119,19 @@ GetSnapshotData(Snapshot snapshot)
*/
LWLockAcquire(ProcArrayLock, LW_SHARED);
+ if (GetSnapshotDataReuse(snapshot))
+ {
+ LWLockRelease(ProcArrayLock);
+ return snapshot;
+ }
+
latest_completed = ShmemVariableCache->latestCompletedXid;
mypgxactoff = MyProc->pgxactoff;
myxid = other_xids[mypgxactoff];
Assert(myxid == MyProc->xid);
oldestxid = ShmemVariableCache->oldestXid;
+ curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
/* xmax is always latestCompletedXid + 1 */
xmax = XidFromFullTransactionId(latest_completed);
@@ -2266,6 +2385,7 @@ GetSnapshotData(Snapshot snapshot)
snapshot->xcnt = count;
snapshot->subxcnt = subcount;
snapshot->suboverflowed = suboverflowed;
+ snapshot->snapXactCompletionCount = curXactCompletionCount;
snapshot->curcid = GetCurrentCommandId(false);
@@ -2277,26 +2397,7 @@ GetSnapshotData(Snapshot snapshot)
snapshot->regd_count = 0;
snapshot->copied = false;
- if (old_snapshot_threshold < 0)
- {
- /*
- * If not using "snapshot too old" feature, fill related fields with
- * dummy values that don't require any locking.
- */
- snapshot->lsn = InvalidXLogRecPtr;
- snapshot->whenTaken = 0;
- }
- else
- {
- /*
- * Capture the current time and WAL stream location in case this
- * snapshot becomes old enough to need to fall back on the special
- * "old snapshot" logic.
- */
- snapshot->lsn = GetXLogInsertRecPtr();
- snapshot->whenTaken = GetSnapshotCurrentTimestamp();
- MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin);
- }
+ GetSnapshotDataInitOldSnapshot(snapshot);
return snapshot;
}
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 20103200952e7..7409de9405925 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -460,6 +460,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
LockRelease(&tag, lockmode, false);
}
+/*
+ * LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
/*
* LockPage
*
@@ -1098,6 +1113,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
tag->locktag_field2,
tag->locktag_field1);
break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ appendStringInfo(buf,
+ _("pg_database.datfrozenxid of database %u"),
+ tag->locktag_field1);
+ break;
case LOCKTAG_PAGE:
appendStringInfo(buf,
_("page %u of relation %u of database %u"),
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index e6985e8eedfb1..774292fd94277 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,6 @@ MultiXactTruncationLock 41
OldSnapshotTimeMapLock 42
LogicalRepWorkerLock 43
XactTruncationLock 44
+# 45 was XactTruncationLock until removal of BackendRandomLock
+WrapLimitsVacuumLock 46
+NotifyQueueTailLock 47
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 9b0c376c8cb5f..6154d2c8c63b8 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -1197,6 +1197,28 @@ ProcessUtilitySlow(ParseState *pstate,
secondaryObject,
stmt);
}
+ else if (IsA(stmt, TableLikeClause))
+ {
+ /*
+ * Do delayed processing of LIKE options. This
+ * will result in additional sub-statements for us
+ * to process. We can just tack those onto the
+ * to-do list.
+ */
+ TableLikeClause *like = (TableLikeClause *) stmt;
+ RangeVar *rv = ((CreateStmt *) parsetree)->relation;
+ List *morestmts;
+
+ morestmts = expandTableLikeClause(rv, like);
+ stmts = list_concat(stmts, morestmts);
+
+ /*
+ * We don't need a CCI now, besides which the "l"
+ * list pointer is now possibly invalid, so just
+ * skip the CCI test below.
+ */
+ continue;
+ }
else
{
/*
@@ -1405,6 +1427,7 @@ ProcessUtilitySlow(ParseState *pstate,
IndexStmt *stmt = (IndexStmt *) parsetree;
Oid relid;
LOCKMODE lockmode;
+ bool is_alter_table;
if (stmt->concurrent)
PreventInTransactionBlock(isTopLevel,
@@ -1466,6 +1489,17 @@ ProcessUtilitySlow(ParseState *pstate,
list_free(inheritors);
}
+ /*
+ * If the IndexStmt is already transformed, it must have
+ * come from generateClonedIndexStmt, which in current
+ * usage means it came from expandTableLikeClause rather
+ * than from original parse analysis. And that means we
+ * must treat it like ALTER TABLE ADD INDEX, not CREATE.
+ * (This is a bit grotty, but currently it doesn't seem
+ * worth adding a separate bool field for the purpose.)
+ */
+ is_alter_table = stmt->transformed;
+
/* Run parse analysis ... */
stmt = transformIndexStmt(relid, stmt, queryString);
@@ -1477,7 +1511,7 @@ ProcessUtilitySlow(ParseState *pstate,
InvalidOid, /* no predefined OID */
InvalidOid, /* no parent index */
InvalidOid, /* no parent constraint */
- false, /* is_alter_table */
+ is_alter_table,
true, /* check_rights */
true, /* check_not_in_use */
false, /* skip_build */
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index 5d2aca8cfe6f8..54d5c3794726b 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -57,6 +57,7 @@ OBJS = \
lockfuncs.o \
mac.o \
mac8.o \
+ mcxtfuncs.o \
misc.o \
name.o \
network.o \
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index ffd1ce8c76104..429c9280c0cf7 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -271,18 +271,6 @@ float4in(PG_FUNCTION_ARGS)
errmsg("invalid input syntax for type %s: \"%s\"",
"real", orig_num)));
}
-#ifdef HAVE_BUGGY_SOLARIS_STRTOD
- else
- {
- /*
- * Many versions of Solaris have a bug wherein strtod sets endptr to
- * point one byte beyond the end of the string when given "inf" or
- * "infinity".
- */
- if (endptr != num && endptr[-1] == '\0')
- endptr--;
- }
-#endif /* HAVE_BUGGY_SOLARIS_STRTOD */
/* skip trailing whitespace */
while (*endptr != '\0' && isspace((unsigned char) *endptr))
@@ -499,18 +487,6 @@ float8in_internal_opt_error(char *num, char **endptr_p,
type_name, orig_string))),
have_error);
}
-#ifdef HAVE_BUGGY_SOLARIS_STRTOD
- else
- {
- /*
- * Many versions of Solaris have a bug wherein strtod sets endptr to
- * point one byte beyond the end of the string when given "inf" or
- * "infinity".
- */
- if (endptr != num && endptr[-1] == '\0')
- endptr--;
- }
-#endif /* HAVE_BUGGY_SOLARIS_STRTOD */
/* skip trailing whitespace */
while (*endptr != '\0' && isspace((unsigned char) *endptr))
diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index e992d1bbfcedf..f592292d067b8 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -29,6 +29,7 @@
const char *const LockTagTypeNames[] = {
"relation",
"extend",
+ "frozenid",
"page",
"tuple",
"transactionid",
@@ -254,6 +255,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
nulls[8] = true;
nulls[9] = true;
break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
+ nulls[2] = true;
+ nulls[3] = true;
+ nulls[4] = true;
+ nulls[5] = true;
+ nulls[6] = true;
+ nulls[7] = true;
+ nulls[8] = true;
+ nulls[9] = true;
+ break;
case LOCKTAG_PAGE:
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);
diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c
new file mode 100644
index 0000000000000..50e1b07ff02c6
--- /dev/null
+++ b/src/backend/utils/adt/mcxtfuncs.c
@@ -0,0 +1,157 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcxtfuncs.c
+ * Functions to show backend memory context.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/mcxtfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+
+/* ----------
+ * The max bytes for showing identifiers of MemoryContext.
+ * ----------
+ */
+#define MEMORY_CONTEXT_IDENT_DISPLAY_SIZE 1024
+
+/*
+ * PutMemoryContextsStatsTupleStore
+ * One recursion level for pg_get_backend_memory_contexts.
+ */
+static void
+PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore,
+ TupleDesc tupdesc, MemoryContext context,
+ const char *parent, int level)
+{
+#define PG_GET_BACKEND_MEMORY_CONTEXTS_COLS 9
+
+ Datum values[PG_GET_BACKEND_MEMORY_CONTEXTS_COLS];
+ bool nulls[PG_GET_BACKEND_MEMORY_CONTEXTS_COLS];
+ MemoryContextCounters stat;
+ MemoryContext child;
+ const char *name;
+ const char *ident;
+
+ AssertArg(MemoryContextIsValid(context));
+
+ name = context->name;
+ ident = context->ident;
+
+ /*
+ * To be consistent with logging output, we label dynahash contexts
+ * with just the hash table name as with MemoryContextStatsPrint().
+ */
+ if (ident && strcmp(name, "dynahash") == 0)
+ {
+ name = ident;
+ ident = NULL;
+ }
+
+ /* Examine the context itself */
+ memset(&stat, 0, sizeof(stat));
+ (*context->methods->stats) (context, NULL, (void *) &level, &stat);
+
+ memset(values, 0, sizeof(values));
+ memset(nulls, 0, sizeof(nulls));
+
+ if (name)
+ values[0] = CStringGetTextDatum(name);
+ else
+ nulls[0] = true;
+
+ if (ident)
+ {
+ int idlen = strlen(ident);
+ char clipped_ident[MEMORY_CONTEXT_IDENT_DISPLAY_SIZE];
+
+ /*
+ * Some identifiers such as SQL query string can be very long,
+ * truncate oversize identifiers.
+ */
+ if (idlen >= MEMORY_CONTEXT_IDENT_DISPLAY_SIZE)
+ idlen = pg_mbcliplen(ident, idlen, MEMORY_CONTEXT_IDENT_DISPLAY_SIZE - 1);
+
+ memcpy(clipped_ident, ident, idlen);
+ clipped_ident[idlen] = '\0';
+ values[1] = CStringGetTextDatum(clipped_ident);
+ }
+ else
+ nulls[1] = true;
+
+ if (parent)
+ values[2] = CStringGetTextDatum(parent);
+ else
+ nulls[2] = true;
+
+ values[3] = Int32GetDatum(level);
+ values[4] = Int64GetDatum(stat.totalspace);
+ values[5] = Int64GetDatum(stat.nblocks);
+ values[6] = Int64GetDatum(stat.freespace);
+ values[7] = Int64GetDatum(stat.freechunks);
+ values[8] = Int64GetDatum(stat.totalspace - stat.freespace);
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+
+ for (child = context->firstchild; child != NULL; child = child->nextchild)
+ {
+ PutMemoryContextsStatsTupleStore(tupstore, tupdesc,
+ child, name, level + 1);
+ }
+}
+
+/*
+ * pg_get_backend_memory_contexts
+ * SQL SRF showing backend memory context.
+ */
+Datum
+pg_get_backend_memory_contexts(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ PutMemoryContextsStatsTupleStore(tupstore, tupdesc,
+ TopMemoryContext, NULL, 0);
+
+ /* clean up and return the tuplestore */
+ tuplestore_donestoring(tupstore);
+
+ return (Datum) 0;
+}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index a2453cf1f4211..96ecad02ddb19 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1870,7 +1870,7 @@ formrdesc(const char *relationName, Oid relationReltype,
relation->rd_rel->relreplident = REPLICA_IDENTITY_NOTHING;
relation->rd_rel->relpages = 0;
- relation->rd_rel->reltuples = 0;
+ relation->rd_rel->reltuples = -1;
relation->rd_rel->relallvisible = 0;
relation->rd_rel->relkind = RELKIND_RELATION;
relation->rd_rel->relnatts = (int16) natts;
@@ -3692,7 +3692,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
if (relation->rd_rel->relkind != RELKIND_SEQUENCE)
{
classform->relpages = 0; /* it's empty until further notice */
- classform->reltuples = 0;
+ classform->reltuples = -1;
classform->relallvisible = 0;
}
classform->relfrozenxid = freezeXid;
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index abda22fa570a3..88c76f290cea8 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -67,6 +67,7 @@ static void MemoryContextStatsPrint(MemoryContext context, void *passthru,
#define AssertNotInCriticalSection(context) \
Assert(CritSectionCount == 0 || (context)->allowInCritSection)
+
/*****************************************************************************
* EXPORTED ROUTINES *
*****************************************************************************/
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile
index 7ac3659261e33..f11fe85aeb314 100644
--- a/src/backend/utils/sort/Makefile
+++ b/src/backend/utils/sort/Makefile
@@ -16,6 +16,7 @@ override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
OBJS = \
logtape.o \
+ sharedbits.o \
sharedtuplestore.o \
sortsupport.o \
tuplesort.o \
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 5517e59c50fd8..788815cdab6ca 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -78,6 +78,8 @@
#include "postgres.h"
+#include
+
#include "storage/buffile.h"
#include "utils/builtins.h"
#include "utils/logtape.h"
@@ -551,7 +553,7 @@ ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
lt = <s->tapes[i];
pg_itoa(i, filename);
- file = BufFileOpenShared(fileset, filename);
+ file = BufFileOpenShared(fileset, filename, O_RDONLY);
filesize = BufFileSize(file);
/*
diff --git a/src/backend/utils/sort/sharedbits.c b/src/backend/utils/sort/sharedbits.c
new file mode 100644
index 0000000000000..be7000b08cb2f
--- /dev/null
+++ b/src/backend/utils/sort/sharedbits.c
@@ -0,0 +1,288 @@
+#include "postgres.h"
+
+#include
+
+#include "storage/buffile.h"
+#include "utils/sharedbits.h"
+
+/*
+ * TODO: put a comment about not currently supporting parallel scan of the SharedBits
+ * To support parallel scan, need to introduce many more mechanisms
+ */
+
+/* Per-participant shared state */
+struct SharedBitsParticipant
+{
+ bool present;
+ bool writing;
+};
+
+/* Shared control object */
+struct SharedBits
+{
+ int nparticipants; /* Number of participants that can write. */
+ int64 nbits;
+ char name[NAMEDATALEN]; /* A name for this bitstore. */
+
+ SharedBitsParticipant participants[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/* backend-local state */
+struct SharedBitsAccessor
+{
+ int participant;
+ SharedBits *bits;
+ SharedFileSet *fileset;
+ BufFile *write_file;
+ BufFile *combined;
+};
+
+SharedBitsAccessor *
+sb_attach(SharedBits *sbits, int my_participant_number, SharedFileSet *fileset)
+{
+ SharedBitsAccessor *accessor = palloc0(sizeof(SharedBitsAccessor));
+
+ accessor->participant = my_participant_number;
+ accessor->bits = sbits;
+ accessor->fileset = fileset;
+ accessor->write_file = NULL;
+ accessor->combined = NULL;
+ return accessor;
+}
+
+SharedBitsAccessor *
+sb_initialize(SharedBits *sbits,
+ int participants,
+ int my_participant_number,
+ SharedFileSet *fileset,
+ char *name)
+{
+ SharedBitsAccessor *accessor;
+
+ sbits->nparticipants = participants;
+ strcpy(sbits->name, name);
+ sbits->nbits = 0; /* TODO: maybe delete this */
+
+ accessor = palloc0(sizeof(SharedBitsAccessor));
+ accessor->participant = my_participant_number;
+ accessor->bits = sbits;
+ accessor->fileset = fileset;
+ accessor->write_file = NULL;
+ accessor->combined = NULL;
+ return accessor;
+}
+
+/* TODO: is "initialize_accessor" a clear enough API for this? (making the file)? */
+void
+sb_initialize_accessor(SharedBitsAccessor *accessor, uint32 nbits)
+{
+ char name[MAXPGPATH];
+ uint32 num_to_write;
+
+ snprintf(name, MAXPGPATH, "%s.p%d.bitmap", accessor->bits->name, accessor->participant);
+
+ accessor->write_file =
+ BufFileCreateShared(accessor->fileset, name);
+
+ accessor->bits->participants[accessor->participant].present = true;
+ /* TODO: check this math. tuplenumber will be too high? */
+ num_to_write = nbits / 8 + 1;
+
+ /*
+ * TODO: add tests that could exercise a problem with junk being written
+ * to bitmap
+ */
+
+ /*
+ * TODO: is there a better way to write the bytes to the file without
+ * calling BufFileWrite() like this? palloc()ing an undetermined number of
+ * bytes feels like it is against the spirit of this patch to begin with,
+ * but the many function calls seem expensive
+ */
+ for (int i = 0; i < num_to_write; i++)
+ {
+ unsigned char byteToWrite = 0;
+
+ BufFileWrite(accessor->write_file, &byteToWrite, 1);
+ }
+
+ if (BufFileSeek(accessor->write_file, 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file: %m")));
+}
+
+size_t
+sb_estimate(int participants)
+{
+ return offsetof(SharedBits, participants) + participants * sizeof(SharedBitsParticipant);
+}
+
+
+void
+sb_setbit(SharedBitsAccessor *accessor, uint64 bit)
+{
+ SharedBitsParticipant *const participant =
+ &accessor->bits->participants[accessor->participant];
+
+ /* TODO: use an unsigned int instead of a byte */
+ unsigned char current_outer_byte;
+
+ Assert(accessor->write_file);
+
+ if (!participant->writing)
+ {
+ participant->writing = true;
+ }
+
+ BufFileSeek(accessor->write_file, 0, bit / 8, SEEK_SET);
+ BufFileRead(accessor->write_file, ¤t_outer_byte, 1);
+
+ current_outer_byte |= 1U << (bit % 8);
+
+ BufFileSeek(accessor->write_file, 0, -1, SEEK_CUR);
+ BufFileWrite(accessor->write_file, ¤t_outer_byte, 1);
+}
+
+bool
+sb_checkbit(SharedBitsAccessor *accessor, uint32 n)
+{
+ bool match;
+ uint32 bytenum = n / 8;
+ unsigned char bit = n % 8;
+ unsigned char byte_to_check = 0;
+
+ Assert(accessor->combined);
+
+ /* seek to byte to check */
+ if (BufFileSeek(accessor->combined,
+ 0,
+ bytenum,
+ SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg(
+ "could not rewind shared outer temporary file: %m")));
+ /* read byte containing ntuple bit */
+ if (BufFileRead(accessor->combined, &byte_to_check, 1) == 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg(
+ "could not read byte in outer match status bitmap: %m.")));
+ /* if bit is set */
+ match = ((byte_to_check) >> bit) & 1;
+
+ return match;
+}
+
+BufFile *
+sb_combine(SharedBitsAccessor *accessor)
+{
+ /*
+ * TODO: this tries to close an outer match status file for each
+ * participant in the tuplestore. technically, only participants in the
+ * barrier could have outer match status files, however, all but one
+ * participant continue on and detach from the barrier so we won't have a
+ * reliable way to close only files for those attached to the barrier
+ */
+ BufFile **statuses;
+ BufFile *combined_bitmap_file;
+ int statuses_length;
+
+ int nbparticipants = 0;
+
+ for (int l = 0; l < accessor->bits->nparticipants; l++)
+ {
+ SharedBitsParticipant participant = accessor->bits->participants[l];
+
+ if (participant.present)
+ {
+ Assert(!participant.writing);
+ nbparticipants++;
+ }
+ }
+ statuses = palloc(sizeof(BufFile *) * nbparticipants);
+
+ /*
+ * Open the bitmap shared BufFile from each participant. TODO: explain why
+ * file can be NULLs
+ */
+ statuses_length = 0;
+
+ for (int i = 0; i < accessor->bits->nparticipants; i++)
+ {
+ char bitmap_filename[MAXPGPATH];
+ BufFile *file;
+
+ /* TODO: make a function that will do this */
+ snprintf(bitmap_filename, MAXPGPATH, "%s.p%d.bitmap", accessor->bits->name, i);
+
+ if (!accessor->bits->participants[i].present)
+ continue;
+ file = BufFileOpenShared(accessor->fileset, bitmap_filename, O_RDWR);
+ /* TODO: can we be sure that this file is at beginning? */
+ Assert(file);
+
+ statuses[statuses_length++] = file;
+ }
+
+ combined_bitmap_file = BufFileCreateTemp(false);
+
+ for (int64 cur = 0; cur < BufFileSize(statuses[0]); cur++) /* make it while not EOF */
+ {
+ /*
+ * TODO: make this use an unsigned int instead of a byte so it isn't
+ * so slow
+ */
+ unsigned char combined_byte = 0;
+
+ for (int i = 0; i < statuses_length; i++)
+ {
+ unsigned char read_byte;
+
+ BufFileRead(statuses[i], &read_byte, 1);
+ combined_byte |= read_byte;
+ }
+
+ BufFileWrite(combined_bitmap_file, &combined_byte, 1);
+ }
+
+ if (BufFileSeek(combined_bitmap_file, 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file: %m")));
+
+ for (int i = 0; i < statuses_length; i++)
+ BufFileClose(statuses[i]);
+ pfree(statuses);
+
+ accessor->combined = combined_bitmap_file;
+ return combined_bitmap_file;
+}
+
+void
+sb_end_write(SharedBitsAccessor *sba)
+{
+ SharedBitsParticipant
+ *const participant = &sba->bits->participants[sba->participant];
+
+ participant->writing = false;
+
+ /*
+ * TODO: this should not be needed if flow is correct. need to fix that
+ * and get rid of this check
+ */
+ if (sba->write_file)
+ BufFileClose(sba->write_file);
+ sba->write_file = NULL;
+}
+
+void
+sb_end_read(SharedBitsAccessor *accessor)
+{
+ if (accessor->combined == NULL)
+ return;
+
+ BufFileClose(accessor->combined);
+ accessor->combined = NULL;
+}
diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c
index 6537a4303b125..cb5d9506760b7 100644
--- a/src/backend/utils/sort/sharedtuplestore.c
+++ b/src/backend/utils/sort/sharedtuplestore.c
@@ -47,19 +47,28 @@ typedef struct SharedTuplestoreChunk
char data[FLEXIBLE_ARRAY_MEMBER];
} SharedTuplestoreChunk;
+typedef enum SharedTuplestoreMode
+{
+ WRITING = 0,
+ READING = 1,
+ APPENDING = 2
+} SharedTuplestoreMode;
+
/* Per-participant shared state. */
typedef struct SharedTuplestoreParticipant
{
LWLock lock;
BlockNumber read_page; /* Page number for next read. */
+ bool rewound;
BlockNumber npages; /* Number of pages written. */
- bool writing; /* Used only for assertions. */
+ SharedTuplestoreMode mode; /* Used only for assertions. */
} SharedTuplestoreParticipant;
/* The control object that lives in shared memory. */
struct SharedTuplestore
{
int nparticipants; /* Number of participants that can write. */
+ pg_atomic_uint32 ntuples; /* Number of tuples in this tuplestore. */
int flags; /* Flag bits from SHARED_TUPLESTORE_XXX */
size_t meta_data_size; /* Size of per-tuple header. */
char name[NAMEDATALEN]; /* A name for this tuplestore. */
@@ -92,6 +101,8 @@ struct SharedTuplestoreAccessor
BlockNumber write_page; /* The next page to write to. */
char *write_pointer; /* Current write pointer within chunk. */
char *write_end; /* One past the end of the current chunk. */
+ bool participated; /* Did the worker participate in writing this
+ * STS at any point */
};
static void sts_filename(char *name, SharedTuplestoreAccessor *accessor,
@@ -137,6 +148,7 @@ sts_initialize(SharedTuplestore *sts, int participants,
Assert(my_participant_number < participants);
sts->nparticipants = participants;
+ pg_atomic_init_u32(&sts->ntuples, 1);
sts->meta_data_size = meta_data_size;
sts->flags = flags;
@@ -158,7 +170,8 @@ sts_initialize(SharedTuplestore *sts, int participants,
LWLockInitialize(&sts->participants[i].lock,
LWTRANCHE_SHARED_TUPLESTORE);
sts->participants[i].read_page = 0;
- sts->participants[i].writing = false;
+ sts->participants[i].rewound = false;
+ sts->participants[i].mode = READING;
}
accessor = palloc0(sizeof(SharedTuplestoreAccessor));
@@ -188,6 +201,7 @@ sts_attach(SharedTuplestore *sts,
accessor->sts = sts;
accessor->fileset = fileset;
accessor->context = CurrentMemoryContext;
+ accessor->participated = false;
return accessor;
}
@@ -219,7 +233,9 @@ sts_end_write(SharedTuplestoreAccessor *accessor)
pfree(accessor->write_chunk);
accessor->write_chunk = NULL;
accessor->write_file = NULL;
- accessor->sts->participants[accessor->participant].writing = false;
+ accessor->write_pointer = NULL;
+ accessor->write_end = NULL;
+ accessor->sts->participants[accessor->participant].mode = READING;
}
}
@@ -263,7 +279,7 @@ sts_begin_parallel_scan(SharedTuplestoreAccessor *accessor)
* files have stopped growing.
*/
for (i = 0; i < accessor->sts->nparticipants; ++i)
- Assert(!accessor->sts->participants[i].writing);
+ Assert((accessor->sts->participants[i].mode == READING) || (accessor->sts->participants[i].mode == APPENDING));
/*
* We will start out reading the file that THIS backend wrote. There may
@@ -311,10 +327,11 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data,
/* Create one. Only this backend will write into it. */
sts_filename(name, accessor, accessor->participant);
accessor->write_file = BufFileCreateShared(accessor->fileset, name);
+ accessor->participated = true;
/* Set up the shared state for this backend's file. */
participant = &accessor->sts->participants[accessor->participant];
- participant->writing = true; /* for assertions only */
+ participant->mode = WRITING; /* for assertions only */
}
/* Do we have space? */
@@ -513,6 +530,17 @@ sts_read_tuple(SharedTuplestoreAccessor *accessor, void *meta_data)
return tuple;
}
+MinimalTuple
+sts_parallel_scan_chunk(SharedTuplestoreAccessor *accessor,
+ void *meta_data,
+ bool inner)
+{
+ Assert(accessor->read_file);
+ if (accessor->read_ntuples < accessor->read_ntuples_available)
+ return sts_read_tuple(accessor, meta_data);
+ return NULL;
+}
+
/*
* Get the next tuple in the current parallel scan.
*/
@@ -526,7 +554,13 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
for (;;)
{
/* Can we read more tuples from the current chunk? */
- if (accessor->read_ntuples < accessor->read_ntuples_available)
+ /*
+ * Added a check for accessor->read_file being present here, as it
+ * became relevant for adaptive hashjoin. TODO: Not sure if this has
+ * other consequences for correctness
+ */
+
+ if (accessor->read_ntuples < accessor->read_ntuples_available && accessor->read_file)
return sts_read_tuple(accessor, meta_data);
/* Find the location of a new chunk to read. */
@@ -559,7 +593,7 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
sts_filename(name, accessor, accessor->read_participant);
accessor->read_file =
- BufFileOpenShared(accessor->fileset, name);
+ BufFileOpenShared(accessor->fileset, name, O_RDONLY);
}
/* Seek and load the chunk header. */
@@ -618,6 +652,56 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
return NULL;
}
+uint32
+sts_increment_ntuples(SharedTuplestoreAccessor *accessor)
+{
+ return pg_atomic_fetch_add_u32(&accessor->sts->ntuples, 1);
+}
+
+uint32
+sts_get_tuplenum(SharedTuplestoreAccessor *accessor)
+{
+ return pg_atomic_read_u32(&accessor->sts->ntuples);
+}
+
+int
+sta_get_read_participant(SharedTuplestoreAccessor *accessor)
+{
+ return accessor->read_participant;
+}
+
+void
+sts_spill_leftover_tuples(SharedTuplestoreAccessor *accessor, MinimalTuple tuple, uint32 hashvalue)
+{
+ tupleMetadata metadata;
+ SharedTuplestoreParticipant *participant;
+ char name[MAXPGPATH];
+
+ metadata.hashvalue = hashvalue;
+ participant = &accessor->sts->participants[accessor->participant];
+ participant->mode = APPENDING; /* for assertions only */
+
+ sts_filename(name, accessor, accessor->participant);
+ if (!accessor->participated)
+ {
+ accessor->write_file = BufFileCreateShared(accessor->fileset, name);
+ accessor->participated = true;
+ }
+
+ else
+ accessor->write_file = BufFileOpenShared(accessor->fileset, name, O_WRONLY);
+
+ BufFileSeek(accessor->write_file, 0, -1, SEEK_END);
+ do
+ {
+ sts_puttuple(accessor, &metadata, tuple);
+ } while ((tuple = sts_parallel_scan_chunk(accessor, &metadata, true)));
+
+ accessor->read_ntuples = 0;
+ accessor->read_ntuples_available = 0;
+ sts_end_write(accessor);
+}
+
/*
* Create the name used for the BufFile that a given participant will write.
*/
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 752af0c10dfc0..22cf3ebaf4728 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -53,6 +53,7 @@
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
+#include "datatype/timestamp.h"
#include "lib/pairingheap.h"
#include "miscadmin.h"
#include "storage/predicate.h"
@@ -67,6 +68,7 @@
#include "utils/resowner_private.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
+#include "utils/timestamp.h"
/*
@@ -595,6 +597,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid,
CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery;
/* NB: curcid should NOT be copied, it's a local matter */
+ CurrentSnapshot->snapXactCompletionCount = 0;
+
/*
* Now we have to fix what GetSnapshotData did with MyProc->xmin and
* TransactionXmin. There is a race condition: to make sure we are not
@@ -670,6 +674,7 @@ CopySnapshot(Snapshot snapshot)
newsnap->regd_count = 0;
newsnap->active_count = 0;
newsnap->copied = true;
+ newsnap->snapXactCompletionCount = 0;
/* setup XID array */
if (snapshot->xcnt > 0)
@@ -2207,6 +2212,7 @@ RestoreSnapshot(char *start_address)
snapshot->curcid = serialized_snapshot.curcid;
snapshot->whenTaken = serialized_snapshot.whenTaken;
snapshot->lsn = serialized_snapshot.lsn;
+ snapshot->snapXactCompletionCount = 0;
/* Copy XIDs, if present. */
if (serialized_snapshot.xcnt > 0)
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 4f29671d0cdc8..7a5d4562f9461 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -188,7 +188,8 @@ static PQExpBuffer recoveryconfcontents = NULL;
/* Function headers */
static void usage(void);
static void verify_dir_is_empty_or_create(char *dirname, bool *created, bool *found);
-static void progress_report(int tablespacenum, const char *filename, bool force);
+static void progress_report(int tablespacenum, const char *filename, bool force,
+ bool finished);
static void ReceiveTarFile(PGconn *conn, PGresult *res, int rownum);
static void ReceiveTarCopyChunk(size_t r, char *copybuf, void *callback_data);
@@ -765,11 +766,15 @@ verify_dir_is_empty_or_create(char *dirname, bool *created, bool *found)
* Print a progress report based on the global variables. If verbose output
* is enabled, also print the current file name.
*
- * Progress report is written at maximum once per second, unless the
- * force parameter is set to true.
+ * Progress report is written at maximum once per second, unless the force
+ * parameter is set to true.
+ *
+ * If finished is set to true, this is the last progress report. The cursor
+ * is moved to the next line.
*/
static void
-progress_report(int tablespacenum, const char *filename, bool force)
+progress_report(int tablespacenum, const char *filename,
+ bool force, bool finished)
{
int percent;
char totaldone_str[32];
@@ -780,7 +785,7 @@ progress_report(int tablespacenum, const char *filename, bool force)
return;
now = time(NULL);
- if (now == last_progress_report && !force)
+ if (now == last_progress_report && !force && !finished)
return; /* Max once per second */
last_progress_report = now;
@@ -851,10 +856,11 @@ progress_report(int tablespacenum, const char *filename, bool force)
totaldone_str, totalsize_str, percent,
tablespacenum, tablespacecount);
- if (isatty(fileno(stderr)))
- fprintf(stderr, "\r");
- else
- fprintf(stderr, "\n");
+ /*
+ * Stay on the same line if reporting to a terminal and we're not done
+ * yet.
+ */
+ fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
}
static int32
@@ -1277,7 +1283,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
}
}
- progress_report(rownum, state.filename, true);
+ progress_report(rownum, state.filename, true, false);
/*
* Do not sync the resulting tar file yet, all files are synced once at
@@ -1470,7 +1476,7 @@ ReceiveTarCopyChunk(size_t r, char *copybuf, void *callback_data)
}
}
totaldone += r;
- progress_report(state->tablespacenum, state->filename, false);
+ progress_report(state->tablespacenum, state->filename, false, false);
}
@@ -1528,7 +1534,7 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum)
if (state.file)
fclose(state.file);
- progress_report(rownum, state.filename, true);
+ progress_report(rownum, state.filename, true, false);
if (state.file != NULL)
{
@@ -1709,7 +1715,7 @@ ReceiveTarAndUnpackCopyChunk(size_t r, char *copybuf, void *callback_data)
exit(1);
}
totaldone += r;
- progress_report(state->tablespacenum, state->filename, false);
+ progress_report(state->tablespacenum, state->filename, false, false);
state->current_len_left -= r;
if (state->current_len_left == 0 && state->current_padding == 0)
@@ -2027,11 +2033,7 @@ BaseBackup(void)
ReceiveBackupManifest(conn);
if (showprogress)
- {
- progress_report(PQntuples(res), NULL, true);
- if (isatty(fileno(stderr)))
- fprintf(stderr, "\n"); /* Need to move to next line */
- }
+ progress_report(PQntuples(res), NULL, true, true);
PQclear(res);
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 1daa5aed0e0fd..ffdc23945c6dc 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -125,7 +125,7 @@ static const struct exclude_list_item skip[] = {
* src/bin/pg_basebackup/pg_basebackup.c.
*/
static void
-progress_report(bool force)
+progress_report(bool finished)
{
int percent;
char total_size_str[32];
@@ -135,7 +135,7 @@ progress_report(bool force)
Assert(showprogress);
now = time(NULL);
- if (now == last_progress_report && !force)
+ if (now == last_progress_report && !finished)
return; /* Max once per second */
/* Save current time */
@@ -162,8 +162,11 @@ progress_report(bool force)
(int) strlen(current_size_str), current_size_str, total_size_str,
percent);
- /* Stay on the same line if reporting to a terminal */
- fprintf(stderr, isatty(fileno(stderr)) ? "\r" : "\n");
+ /*
+ * Stay on the same line if reporting to a terminal and we're not done
+ * yet.
+ */
+ fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
}
static bool
@@ -624,10 +627,7 @@ main(int argc, char *argv[])
(void) scan_directory(DataDir, "pg_tblspc", false);
if (showprogress)
- {
progress_report(true);
- fprintf(stderr, "\n"); /* Need to move to next line */
- }
printf(_("Checksum operation completed\n"));
printf(_("Files scanned: %s\n"), psprintf(INT64_FORMAT, files));
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 2325fb5d30216..2229c86f9afbc 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -14,6 +14,7 @@
#include
#include "access/rmgr.h"
+#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xlogreader.h"
#include "catalog/pg_control.h"
@@ -397,6 +398,18 @@ extractPageInfo(XLogReaderState *record)
* source system.
*/
}
+ else if (rmid == RM_XACT_ID &&
+ ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT ||
+ (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED ||
+ (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT ||
+ (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT_PREPARED))
+ {
+ /*
+ * These records can include "dropped rels". We can safely ignore
+ * them, we will see that they are missing and copy them from the
+ * source.
+ */
+ }
else if (info & XLR_SPECIAL_REL_UPDATE)
{
/*
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 0015d3b461a71..23fc749e44515 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -422,7 +422,6 @@ main(int argc, char **argv)
executeFileMap();
progress_report(true);
- printf("\n");
if (showprogress)
pg_log_info("creating backup label and updating control file");
@@ -519,11 +518,14 @@ sanityChecks(void)
/*
* Print a progress report based on the fetch_size and fetch_done variables.
*
- * Progress report is written at maximum once per second, unless the
- * force parameter is set to true.
+ * Progress report is written at maximum once per second, except that the
+ * last progress report is always printed.
+ *
+ * If finished is set to true, this is the last progress report. The cursor
+ * is moved to the next line.
*/
void
-progress_report(bool force)
+progress_report(bool finished)
{
static pg_time_t last_progress_report = 0;
int percent;
@@ -535,7 +537,7 @@ progress_report(bool force)
return;
now = time(NULL);
- if (now == last_progress_report && !force)
+ if (now == last_progress_report && !finished)
return; /* Max once per second */
last_progress_report = now;
@@ -565,10 +567,12 @@ progress_report(bool force)
fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
(int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
percent);
- if (isatty(fileno(stderr)))
- fprintf(stderr, "\r");
- else
- fprintf(stderr, "\n");
+
+ /*
+ * Stay on the same line if reporting to a terminal and we're not done
+ * yet.
+ */
+ fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
}
/*
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index 5cf5f17bb5f1a..8a9319ed67597 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -53,7 +53,7 @@ extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
int tliIndex, const char *restoreCommand);
/* in pg_rewind.c */
-extern void progress_report(bool force);
+extern void progress_report(bool finished);
/* in timeline.c */
extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer,
diff --git a/src/bin/pg_rewind/t/001_basic.pl b/src/bin/pg_rewind/t/001_basic.pl
index fb4a0acd965af..ba528e262f32d 100644
--- a/src/bin/pg_rewind/t/001_basic.pl
+++ b/src/bin/pg_rewind/t/001_basic.pl
@@ -1,7 +1,7 @@
use strict;
use warnings;
use TestLib;
-use Test::More tests => 20;
+use Test::More tests => 23;
use FindBin;
use lib $FindBin::RealBin;
@@ -29,6 +29,10 @@ sub run_test
primary_psql("CREATE TABLE tail_tbl (id integer, d text)");
primary_psql("INSERT INTO tail_tbl VALUES (0, 'in primary')");
+ # This test table is dropped in the old primary after promotion.
+ primary_psql("CREATE TABLE drop_tbl (d text)");
+ primary_psql("INSERT INTO drop_tbl VALUES ('in primary')");
+
primary_psql("CHECKPOINT");
RewindTest::create_standby($test_mode);
@@ -66,6 +70,9 @@ sub run_test
primary_psql("DELETE FROM tail_tbl WHERE id > 10");
primary_psql("VACUUM tail_tbl");
+ # Drop drop_tbl. pg_rewind should copy it back.
+ primary_psql("DROP TABLE drop_tbl");
+
# Before running pg_rewind, do a couple of extra tests with several
# option combinations. As the code paths taken by those tests
# do not change for the "local" and "remote" modes, just run them
@@ -154,6 +161,12 @@ sub run_test
),
'tail-copy');
+ check_query(
+ 'SELECT * FROM drop_tbl',
+ qq(in primary
+),
+ 'drop');
+
# Permissions on PGDATA should be default
SKIP:
{
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index d1a0678935397..31e99c2a6da5d 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -611,14 +611,9 @@ XLogDumpDisplayStats(XLogDumpConfig *config, XLogDumpStats *stats)
double rec_len_pct,
fpi_len_pct;
- /* ---
- * Make a first pass to calculate column totals:
- * count(*),
- * sum(xl_len+SizeOfXLogRecord),
- * sum(xl_tot_len-xl_len-SizeOfXLogRecord), and
- * sum(xl_tot_len).
- * These are used to calculate percentages for each record type.
- * ---
+ /*
+ * Each row shows its percentages of the total, so make a first pass to
+ * calculate column totals.
*/
for (ri = 0; ri < RM_NEXT_ID; ri++)
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 931257bd8172f..68d90f5141d61 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -38,8 +38,8 @@ typedef struct IndexBuildResult
*
* num_heap_tuples is accurate only when estimated_count is false;
* otherwise it's just an estimate (currently, the estimate is the
- * prior value of the relation's pg_class.reltuples field). It will
- * always just be an estimate during ambulkdelete.
+ * prior value of the relation's pg_class.reltuples field, so it could
+ * even be -1). It will always just be an estimate during ambulkdelete.
*/
typedef struct IndexVacuumInfo
{
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index ba77013f64f27..92b19dba324fb 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -178,7 +178,8 @@ extern int heap_page_prune(Relation relation, Buffer buffer,
struct GlobalVisState *vistest,
TransactionId limited_oldest_xmin,
TimestampTz limited_oldest_ts,
- bool report_stats, TransactionId *latestRemovedXid);
+ bool report_stats, TransactionId *latestRemovedXid,
+ OffsetNumber *off_loc);
extern void heap_page_prune_execute(Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index aa17f7df84d4b..15251941128a4 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -137,8 +137,6 @@ typedef struct xl_heap_truncate
* or updated tuple in WAL; we can save a few bytes by reconstructing the
* fields that are available elsewhere in the WAL record, or perhaps just
* plain needn't be reconstructed. These are the fields we must store.
- * NOTE: t_hoff could be recomputed, but we may as well store it because
- * it will come for free due to alignment considerations.
*/
typedef struct xl_heap_header
{
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index b32044153b09d..2f1f144db4d06 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -231,6 +231,15 @@ typedef struct VariableCacheData
FullTransactionId latestCompletedXid; /* newest full XID that has
* committed or aborted */
+ /*
+ * Number of top-level transactions with xids (i.e. which may have
+ * modified the database) that completed in some form since the start of
+ * the server. This currently is solely used to check whether
+ * GetSnapshotData() needs to recompute the contents of the snapshot, or
+ * not. There are likely other users of this. Always above 1.
+ */
+ uint64 xactCompletionCount;
+
/*
* These fields are protected by XactTruncationLock
*/
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index c18554bae2c25..df1b43a932e3d 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -16,11 +16,11 @@
#include "access/transam.h"
#include "access/xlogreader.h"
+#include "datatype/timestamp.h"
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
#include "storage/relfilenode.h"
#include "storage/sinval.h"
-#include "utils/datetime.h"
/*
* Maximum size of Global Transaction ID (including '\0').
@@ -82,8 +82,8 @@ typedef enum
extern int synchronous_commit;
/* used during logical streaming of a transaction */
-extern TransactionId CheckXidAlive;
-extern bool bsysscan;
+extern PGDLLIMPORT TransactionId CheckXidAlive;
+extern PGDLLIMPORT bool bsysscan;
/*
* Miscellaneous flag bits to record events which occur on the top level
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 9b2da56379e15..4146753d4765d 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -43,11 +43,8 @@ typedef struct XLogPageHeaderData
/*
* When there is not enough space on current page for whole record, we
* continue on the next page. xlp_rem_len is the number of bytes
- * remaining from a previous page.
- *
- * Note that xlp_rem_len includes backup-block data; that is, it tracks
- * xl_tot_len not xl_len in the initial header. Also note that the
- * continuation data isn't necessarily aligned.
+ * remaining from a previous page; it tracks xl_tot_len in the initial
+ * header. Note that the continuation data isn't necessarily aligned.
*/
uint32 xlp_rem_len; /* total len of remaining data for record */
} XLogPageHeaderData;
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 928495112196a..52ca61f8a8e83 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202007251
+#define CATALOG_VERSION_NO 202008301
#endif
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index 78b33b2a7f9b8..679eec34439b6 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -62,8 +62,8 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat
/* # of blocks (not always up-to-date) */
int32 relpages BKI_DEFAULT(0);
- /* # of tuples (not always up-to-date) */
- float4 reltuples BKI_DEFAULT(0);
+ /* # of tuples (not always up-to-date; -1 means "unknown") */
+ float4 reltuples BKI_DEFAULT(-1);
/* # of all-visible blocks (not always up-to-date) */
int32 relallvisible BKI_DEFAULT(0);
diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat
index 5b0e063655d33..4f8b9865effc4 100644
--- a/src/include/catalog/pg_operator.dat
+++ b/src/include/catalog/pg_operator.dat
@@ -218,10 +218,10 @@
oprname => '>=', oprleft => 'xid8', oprright => 'xid8', oprresult => 'bool',
oprcom => '<=(xid8,xid8)', oprnegate => '<(xid8,xid8)', oprcode => 'xid8ge',
oprrest => 'scalargesel', oprjoin => 'scalargejoinsel' },
-{ oid => '388', descr => 'factorial',
+{ oid => '388', descr => 'deprecated, use factorial() instead',
oprname => '!', oprkind => 'r', oprleft => 'int8', oprright => '0',
oprresult => 'numeric', oprcode => 'numeric_fac' },
-{ oid => '389', descr => 'deprecated, use ! instead',
+{ oid => '389', descr => 'deprecated, use factorial() instead',
oprname => '!!', oprkind => 'l', oprleft => '0', oprright => 'int8',
oprresult => 'numeric', oprcode => 'numeric_fac' },
{ oid => '385', descr => 'equal',
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 082a11f2708c6..1dd325e0e6fdc 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -328,6 +328,7 @@
proname => 'unknownout', prorettype => 'cstring', proargtypes => 'unknown',
prosrc => 'unknownout' },
{ oid => '111',
+ descr => 'implementation of deprecated ! and !! factorial operators',
proname => 'numeric_fac', prorettype => 'numeric', proargtypes => 'int8',
prosrc => 'numeric_fac' },
@@ -7807,6 +7808,15 @@
proargnames => '{name,off,size,allocated_size}',
prosrc => 'pg_get_shmem_allocations' },
+# memory context of local backend
+{ oid => '2282', descr => 'information about all memory contexts of local backend',
+ proname => 'pg_get_backend_memory_contexts', prorows => '100', proretset => 't',
+ provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '',
+ proallargtypes => '{text,text,text,int4,int8,int8,int8,int8,int8}',
+ proargmodes => '{o,o,o,o,o,o,o,o,o}',
+ proargnames => '{name, ident, parent, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes}',
+ prosrc => 'pg_get_backend_memory_contexts' },
+
# non-persistent series generator
{ oid => '1066', descr => 'non-persistent series generator',
proname => 'generate_series', prorows => '1000',
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h
index ba661d32a6309..0ba9d856c8384 100644
--- a/src/include/commands/explain.h
+++ b/src/include/commands/explain.h
@@ -46,6 +46,7 @@ typedef struct ExplainState
bool timing; /* print detailed node timing */
bool summary; /* print total planning and execution timing */
bool settings; /* print modified settings */
+ bool usage; /* print memory usage */
ExplainFormat format; /* output format */
/* state for output formatting --- not reset for each new plan tree */
int indent; /* current indentation level */
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index eb5daba36b0ff..e9354cc6e05c0 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -19,6 +19,7 @@
#include "storage/barrier.h"
#include "storage/buffile.h"
#include "storage/lwlock.h"
+#include "utils/sharedbits.h"
/* ----------------------------------------------------------------
* hash-join hash table structures
@@ -142,6 +143,17 @@ typedef struct HashMemoryChunkData *HashMemoryChunk;
/* tuples exceeding HASH_CHUNK_THRESHOLD bytes are put in their own chunk */
#define HASH_CHUNK_THRESHOLD (HASH_CHUNK_SIZE / 4)
+/*
+ * HashJoinTableData->curstripe the current stripe number
+ * The phantom stripe refers to the state of the inner side hashtable (empty)
+ * during the final scan of the outer batch file for a batch being processed
+ * using the hashloop fallback algorithm.
+ * In parallel-aware hash join, curstripe is in a detached state
+ * when the worker is not attached to the stripe_barrier.
+ */
+#define PHANTOM_STRIPE -2
+#define STRIPE_DETACHED -1
+
/*
* For each batch of a Parallel Hash Join, we have a ParallelHashJoinBatch
* object in shared memory to coordinate access to it. Since they are
@@ -152,14 +164,34 @@ typedef struct ParallelHashJoinBatch
{
dsa_pointer buckets; /* array of hash table buckets */
Barrier batch_barrier; /* synchronization for joining this batch */
+ Barrier stripe_barrier; /* synchronization for stripes */
dsa_pointer chunks; /* chunks of tuples loaded */
size_t size; /* size of buckets + chunks in memory */
size_t estimated_size; /* size of buckets + chunks while writing */
- size_t ntuples; /* number of tuples loaded */
+ /* total number of tuples loaded into batch (in memory and spill files) */
+ size_t ntuples;
size_t old_ntuples; /* number of tuples before repartitioning */
bool space_exhausted;
+ /* Adaptive HashJoin */
+
+ /*
+ * after finishing build phase, hashloop_fallback cannot change, and does
+ * not require a lock to read
+ */
+ pg_atomic_flag overflow_required;
+ bool hashloop_fallback;
+ int nstripes; /* the number of stripes in the batch */
+ /* number of tuples loaded into the hashtable */
+ pg_atomic_uint64 ntuples_in_memory;
+
+ /*
+ * Note that ntuples will reflect the total number of tuples in the batch
+ * while ntuples_in_memory will reflect how many tuples are in memory
+ */
+ LWLock lock;
+
/*
* Variable-sized SharedTuplestore objects follow this struct in memory.
* See the accessor macros below.
@@ -177,10 +209,17 @@ typedef struct ParallelHashJoinBatch
((char *) ParallelHashJoinBatchInner(batch) + \
MAXALIGN(sts_estimate(nparticipants))))
+/* Accessor for sharedbits following a ParallelHashJoinBatch. */
+#define ParallelHashJoinBatchOuterBits(batch, nparticipants) \
+ ((SharedBits *) \
+ ((char *) ParallelHashJoinBatchOuter(batch, nparticipants) + \
+ MAXALIGN(sts_estimate(nparticipants))))
+
/* Total size of a ParallelHashJoinBatch and tuplestores. */
#define EstimateParallelHashJoinBatch(hashtable) \
(MAXALIGN(sizeof(ParallelHashJoinBatch)) + \
- MAXALIGN(sts_estimate((hashtable)->parallel_state->nparticipants)) * 2)
+ MAXALIGN(sts_estimate((hashtable)->parallel_state->nparticipants)) * 2 + \
+ MAXALIGN(sb_estimate((hashtable)->parallel_state->nparticipants)))
/* Accessor for the nth ParallelHashJoinBatch given the base. */
#define NthParallelHashJoinBatch(base, n) \
@@ -204,9 +243,19 @@ typedef struct ParallelHashJoinBatchAccessor
size_t old_ntuples; /* how many tuples before repartitioning? */
bool at_least_one_chunk; /* has this backend allocated a chunk? */
- bool done; /* flag to remember that a batch is done */
+ int done; /* flag to remember that a batch is done */
+ /* -1 for not done, 0 for tentatively done, 1 for done */
SharedTuplestoreAccessor *inner_tuples;
SharedTuplestoreAccessor *outer_tuples;
+ SharedBitsAccessor *sba;
+
+ /*
+ * All participants except the last worker working on a batch which has
+ * fallen back to hashloop processing save the stripe barrier phase and
+ * detach to avoid the deadlock hazard of waiting on a barrier after
+ * tuples have been emitted.
+ */
+ int last_participating_stripe_phase;
} ParallelHashJoinBatchAccessor;
/*
@@ -223,10 +272,28 @@ typedef enum ParallelHashGrowth
PHJ_GROWTH_NEED_MORE_BUCKETS,
/* The memory budget would be exhausted, so we need to repartition. */
PHJ_GROWTH_NEED_MORE_BATCHES,
- /* Repartitioning didn't help last time, so don't try to do that again. */
- PHJ_GROWTH_DISABLED
+
+ /*
+ * While repartitioning or, if nbatches would overflow int, disable growth
+ * in the number of batches
+ */
+ PHJ_GROWTH_DISABLED,
+ PHJ_GROWTH_SPILL_BATCH0,
+ PHJ_GROWTH_LOADING
} ParallelHashGrowth;
+typedef enum ParallelHashJoinBatchAccessorStatus
+{
+ /* No more useful work can be done on this batch by this worker */
+ PHJ_BATCH_ACCESSOR_DONE,
+
+ /*
+ * The worker has not yet checked this batch to see if it can do useful
+ * work
+ */
+ PHJ_BATCH_ACCESSOR_NOT_DONE
+} ParallelHashJoinBatchAccessorStatus;
+
/*
* The shared state used to coordinate a Parallel Hash Join. This is stored
* in the DSM segment.
@@ -246,6 +313,8 @@ typedef struct ParallelHashJoinState
LWLock lock; /* lock protecting the above */
Barrier build_barrier; /* synchronization for the build phases */
+ Barrier eviction_barrier;
+ Barrier repartition_barrier;
Barrier grow_batches_barrier;
Barrier grow_buckets_barrier;
pg_atomic_uint32 distributor; /* counter for load balancing */
@@ -263,9 +332,42 @@ typedef struct ParallelHashJoinState
/* The phases for probing each batch, used by for batch_barrier. */
#define PHJ_BATCH_ELECTING 0
#define PHJ_BATCH_ALLOCATING 1
-#define PHJ_BATCH_LOADING 2
-#define PHJ_BATCH_PROBING 3
-#define PHJ_BATCH_DONE 4
+#define PHJ_BATCH_STRIPING 2
+#define PHJ_BATCH_DONE 3
+
+/* The phases for probing each stripe of each batch used with stripe barriers */
+#define PHJ_STRIPE_INVALID_PHASE -1
+#define PHJ_STRIPE_ELECTING 0
+#define PHJ_STRIPE_RESETTING 1
+#define PHJ_STRIPE_LOADING 2
+#define PHJ_STRIPE_OVERFLOWING 3
+#define PHJ_STRIPE_PROBING 4
+#define PHJ_STRIPE_DONE 5
+#define PHJ_STRIPE_NUMBER(n) ((n) / 6)
+#define PHJ_STRIPE_PHASE(n) ((n) % 6)
+
+#define PHJ_EVICT_ELECTING 0
+#define PHJ_EVICT_RESETTING 1
+#define PHJ_EVICT_SPILLING 2
+#define PHJ_EVICT_FINISHING 3
+#define PHJ_EVICT_DONE 4
+#define PHJ_EVICT_PHASE(n) ((n) % 5)
+
+/*
+ * These phases are now required for repartitioning batch 0 since it can
+ * spill. First all tuples which were resident in the hashtable need to
+ * be relocated either back to the hashtable or to a spill file, if they
+ * would relocate to a batch 1+ given the new number of batches. After
+ * draining the chunk_work_queue, we must drain the batch 0 spill file,
+ * if it exists. Some tuples may have been relocated from the hashtable
+ * to other batches, in which case, space may have been freed up which
+ * the tuples from the batch 0 spill file can occupy. The tuples from the
+ * batch 0 spill file may go to 1) the hashtable, 2) back to the batch 0
+ * spill file in the new generation of batches, 3) to a batch file 1+
+ */
+#define PHJ_REPARTITION_BATCH0_DRAIN_QUEUE 0
+#define PHJ_REPARTITION_BATCH0_DRAIN_SPILL_FILE 1
+#define PHJ_REPARTITION_BATCH0_PHASE(n) ((n) % 2)
/* The phases of batch growth while hashing, for grow_batches_barrier. */
#define PHJ_GROW_BATCHES_ELECTING 0
@@ -313,8 +415,6 @@ typedef struct HashJoinTableData
int nbatch_original; /* nbatch when we started inner scan */
int nbatch_outstart; /* nbatch when we started outer scan */
- bool growEnabled; /* flag to shut off nbatch increases */
-
double totalTuples; /* # tuples obtained from inner plan */
double partialTuples; /* # tuples obtained from inner plan by me */
double skewTuples; /* # tuples inserted into skew tuples */
@@ -329,6 +429,18 @@ typedef struct HashJoinTableData
BufFile **innerBatchFile; /* buffered virtual temp file per batch */
BufFile **outerBatchFile; /* buffered virtual temp file per batch */
+ /*
+ * Adaptive hashjoin variables
+ */
+ BufFile **hashloopBatchFile; /* outer match status files if fall back */
+ List *fallback_batches_stats; /* per hashjoin batch statistics */
+
+ /*
+ * current stripe #; 0 during 1st pass, -1 (macro STRIPE_DETACHED) when
+ * detached, -2 on phantom stripe (macro PHANTOM_STRIPE)
+ */
+ int curstripe;
+
/*
* Info about the datatype-specific hash functions for the datatypes being
* hashed. These are arrays of the same length as the number of hash join
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 9dc3ecb07d79b..839086005c7bd 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -14,6 +14,7 @@
#define INSTRUMENT_H
#include "portability/instr_time.h"
+#include "nodes/pg_list.h"
typedef struct BufferUsage
@@ -39,6 +40,12 @@ typedef struct WalUsage
uint64 wal_bytes; /* size of WAL records produced */
} WalUsage;
+typedef struct FallbackBatchStats
+{
+ int batchno;
+ int numstripes;
+} FallbackBatchStats;
+
/* Flag bits included in InstrAlloc's instrument_options bitmask */
typedef enum InstrumentOption
{
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index 2db4e2f67267b..6d094e1a43041 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -31,6 +31,7 @@ extern void ExecParallelHashTableAlloc(HashJoinTable hashtable,
extern void ExecHashTableDestroy(HashJoinTable hashtable);
extern void ExecHashTableDetach(HashJoinTable hashtable);
extern void ExecHashTableDetachBatch(HashJoinTable hashtable);
+extern bool ExecHashTableDetachStripe(HashJoinTable hashtable);
extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable,
int batchno);
@@ -40,9 +41,11 @@ extern void ExecHashTableInsert(HashJoinTable hashtable,
extern void ExecParallelHashTableInsert(HashJoinTable hashtable,
TupleTableSlot *slot,
uint32 hashvalue);
-extern void ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
+extern MinimalTuple
+ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
TupleTableSlot *slot,
- uint32 hashvalue);
+ uint32 hashvalue,
+ int read_participant);
extern bool ExecHashGetHashValue(HashJoinTable hashtable,
ExprContext *econtext,
List *hashkeys,
@@ -59,6 +62,8 @@ extern void ExecPrepHashTableForUnmatched(HashJoinState *hjstate);
extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate,
ExprContext *econtext);
extern void ExecHashTableReset(HashJoinTable hashtable);
+extern void
+ ExecParallelHashTableRecycle(HashJoinTable hashtable);
extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable);
extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
bool try_combined_hash_mem,
diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h
index f7df70b5abd58..0c0d87d1d3e36 100644
--- a/src/include/executor/tuptable.h
+++ b/src/include/executor/tuptable.h
@@ -129,6 +129,7 @@ typedef struct TupleTableSlot
MemoryContext tts_mcxt; /* slot itself is in this context */
ItemPointerData tts_tid; /* stored tuple's tid */
Oid tts_tableOid; /* table oid of tuple */
+ uint32 tts_tuplenum; /* a tuple id for use when ctid cannot be used */
} TupleTableSlot;
/* routines for a TupleTableSlot implementation */
@@ -425,6 +426,7 @@ static inline TupleTableSlot *
ExecClearTuple(TupleTableSlot *slot)
{
slot->tts_ops->clear(slot);
+ slot->tts_tuplenum = 0; /* TODO: should this be done elsewhere? */
return slot;
}
diff --git a/src/include/jit/llvmjit_emit.h b/src/include/jit/llvmjit_emit.h
index 1a7d6db7259e0..3142df608b3c6 100644
--- a/src/include/jit/llvmjit_emit.h
+++ b/src/include/jit/llvmjit_emit.h
@@ -1,6 +1,6 @@
/*
* llvmjit_emit.h
- * Helpers to make emitting LLVM IR a it more concise and pgindent proof.
+ * Helpers to make emitting LLVM IR a bit more concise and pgindent proof.
*
* Copyright (c) 2018-2020, PostgreSQL Global Development Group
*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index cf832d7f90975..cb30e3bea1528 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -867,6 +867,8 @@ typedef struct SubPlanState
MemoryContext hashtablecxt; /* memory context containing hash tables */
MemoryContext hashtempcxt; /* temp memory context for hash tables */
ExprContext *innerecontext; /* econtext for computing inner tuples */
+ int numCols; /* number of columns being hashed */
+ /* each of the remaining fields is an array of length numCols: */
AttrNumber *keyColIdx; /* control data for hash tables */
Oid *tab_eq_funcoids; /* equality func oids for table
* datatype(s) */
@@ -1957,6 +1959,10 @@ typedef struct HashJoinState
int hj_JoinState;
bool hj_MatchedOuter;
bool hj_OuterNotEmpty;
+ /* Adaptive Hashjoin variables */
+ int hj_CurNumOuterTuples; /* number of outer tuples in a batch */
+ unsigned int hj_CurOuterMatchStatus;
+ int hj_EmitOuterTupleId;
} HashJoinState;
@@ -2385,6 +2391,7 @@ typedef struct HashInstrumentation
int nbatch; /* number of batches at end of execution */
int nbatch_original; /* planned number of batches */
Size space_peak; /* peak memory usage in bytes */
+ List *fallback_batches_stats; /* per hashjoin batch stats */
} HashInstrumentation;
/* ----------------
diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h
index 779906b9b77f9..9cc56eecaa3ac 100644
--- a/src/include/nodes/nodeFuncs.h
+++ b/src/include/nodes/nodeFuncs.h
@@ -36,6 +36,9 @@ typedef bool (*check_function_callback) (Oid func_id, void *context);
extern Oid exprType(const Node *expr);
extern int32 exprTypmod(const Node *expr);
extern bool exprIsLengthCoercion(const Node *expr, int32 *coercedTypmod);
+extern Node *applyRelabelType(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid,
+ CoercionForm rformat, int rlocation,
+ bool overwrite_ok);
extern Node *relabel_to_typmod(Node *expr, int32 typmod);
extern Node *strip_implicit_coercions(Node *node);
extern bool expression_returns_set(Node *clause);
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 151bcdb7ef5b9..47d4c07306d0a 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1786,6 +1786,7 @@ typedef enum AlterTableType
AT_AddColumnRecurse, /* internal to commands/tablecmds.c */
AT_AddColumnToView, /* implicitly via CREATE OR REPLACE VIEW */
AT_ColumnDefault, /* alter column default */
+ AT_CookedColumnDefault, /* add a pre-cooked column default */
AT_DropNotNull, /* alter column drop not null */
AT_SetNotNull, /* alter column set not null */
AT_DropExpression, /* alter column drop expression */
diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h
index b7456e3e595bc..7ef8cce79eeca 100644
--- a/src/include/optimizer/clauses.h
+++ b/src/include/optimizer/clauses.h
@@ -38,6 +38,7 @@ extern bool contain_subplans(Node *clause);
extern char max_parallel_hazard(Query *parse);
extern bool is_parallel_safe(PlannerInfo *root, Node *node);
extern bool contain_nonstrict_functions(Node *clause);
+extern bool contain_exec_param(Node *clause, List *param_ids);
extern bool contain_leaked_vars(Node *clause);
extern Relids find_nonnullable_rels(Node *clause);
diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h
index 1a5e0b83a7a5d..bc3d66ed88146 100644
--- a/src/include/parser/parse_utilcmd.h
+++ b/src/include/parser/parse_utilcmd.h
@@ -31,6 +31,8 @@ extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation parent,
PartitionBoundSpec *spec);
+extern List *expandTableLikeClause(RangeVar *heapRel,
+ TableLikeClause *table_like_clause);
extern IndexStmt *generateClonedIndexStmt(RangeVar *heapRel,
Relation source_idx,
const struct AttrMap *attmap,
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 13872013823ec..399c442171b59 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -855,11 +855,20 @@ typedef enum
WAIT_EVENT_EXECUTE_GATHER,
WAIT_EVENT_HASH_BATCH_ALLOCATE,
WAIT_EVENT_HASH_BATCH_ELECT,
- WAIT_EVENT_HASH_BATCH_LOAD,
+ WAIT_EVENT_HASH_STRIPE_ELECT,
+ WAIT_EVENT_HASH_STRIPE_RESET,
+ WAIT_EVENT_HASH_STRIPE_LOAD,
+ WAIT_EVENT_HASH_STRIPE_OVERFLOW,
+ WAIT_EVENT_HASH_STRIPE_PROBE,
WAIT_EVENT_HASH_BUILD_ALLOCATE,
WAIT_EVENT_HASH_BUILD_ELECT,
WAIT_EVENT_HASH_BUILD_HASH_INNER,
WAIT_EVENT_HASH_BUILD_HASH_OUTER,
+ WAIT_EVENT_HASH_EVICT_ELECT,
+ WAIT_EVENT_HASH_EVICT_RESET,
+ WAIT_EVENT_HASH_EVICT_SPILL,
+ WAIT_EVENT_HASH_EVICT_FINISH,
+ WAIT_EVENT_HASH_REPARTITION_BATCH0_DRAIN_QUEUE,
WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE,
WAIT_EVENT_HASH_GROW_BATCHES_DECIDE,
WAIT_EVENT_HASH_GROW_BATCHES_ELECT,
@@ -916,6 +925,7 @@ typedef enum
WAIT_EVENT_BASEBACKUP_READ = PG_WAIT_IO,
WAIT_EVENT_BUFFILE_READ,
WAIT_EVENT_BUFFILE_WRITE,
+ WAIT_EVENT_BUFFILE_TRUNCATE,
WAIT_EVENT_CONTROL_FILE_READ,
WAIT_EVENT_CONTROL_FILE_SYNC,
WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
diff --git a/src/include/port/cygwin.h b/src/include/port/cygwin.h
index f1fc1a93d76c0..64d69936e5e02 100644
--- a/src/include/port/cygwin.h
+++ b/src/include/port/cygwin.h
@@ -1,14 +1,5 @@
/* src/include/port/cygwin.h */
-#include
-
-/*
- * Check for b20.1 and disable AF_UNIX family socket support.
- */
-#if CYGWIN_VERSION_DLL_MAJOR < 1001
-#undef HAVE_UNIX_SOCKETS
-#endif
-
#ifdef BUILDING_DLL
#define PGDLLIMPORT __declspec (dllexport)
#else
diff --git a/src/include/port/solaris.h b/src/include/port/solaris.h
index eeb1a320bd5b7..e63a3bd824d6d 100644
--- a/src/include/port/solaris.h
+++ b/src/include/port/solaris.h
@@ -24,15 +24,3 @@
#if defined(__i386__)
#include
#endif
-
-/*
- * Many versions of Solaris have broken strtod() --- see bug #4751182.
- * This has been fixed in current versions of Solaris:
- *
- * http://sunsolve.sun.com/search/document.do?assetkey=1-21-108993-62-1&searchclause=108993-62
- * http://sunsolve.sun.com/search/document.do?assetkey=1-21-112874-34-1&searchclause=112874-34
- *
- * However, many people might not have patched versions, so
- * still use our own fix for the buggy version.
- */
-#define HAVE_BUGGY_SOLARIS_STRTOD
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index f4752bab0da5a..fc34c49522dae 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -48,7 +48,9 @@ extern long BufFileAppend(BufFile *target, BufFile *source);
extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
extern void BufFileExportShared(BufFile *file);
-extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name);
+extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name,
+ int mode);
extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name);
+extern void BufFileTruncateShared(BufFile *file, int fileno, off_t offset);
#endif /* BUFFILE_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 8cd125d7dfaa6..e209f047e8533 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -94,7 +94,7 @@ extern mode_t FileGetRawMode(File file);
/* Operations used for sharing named temporary files */
extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
-extern File PathNameOpenTemporaryFile(const char *name);
+extern File PathNameOpenTemporaryFile(const char *path, int mode);
extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure);
extern void PathNameCreateTemporaryDir(const char *base, const char *name);
extern void PathNameDeleteTemporaryDir(const char *name);
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index 3acc11aa5a3b1..f7cabcbbf550e 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -59,6 +59,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
LOCKMODE lockmode);
extern int RelationExtensionLockWaiterCount(Relation relation);
+/* Lock to recompute pg_database.datfrozenxid in the current database */
+extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
+
/* Lock a page (currently only used within indexes) */
extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index fdabf427210ac..1c3e9c1999f56 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -138,6 +138,7 @@ typedef enum LockTagType
{
LOCKTAG_RELATION, /* whole relation */
LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */
+ LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */
LOCKTAG_PAGE, /* one page of a relation */
LOCKTAG_TUPLE, /* one physical tuple */
LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */
@@ -194,6 +195,15 @@ typedef struct LOCKTAG
(locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+/* ID info for frozen IDs is DB OID */
+#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = 0, \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
/* ID info for a page is RELATION info + BlockNumber */
#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
((locktag).locktag_field1 = (dboid), \
diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h
index 2d6cf077e51d9..d5edb600af966 100644
--- a/src/include/storage/sharedfileset.h
+++ b/src/include/storage/sharedfileset.h
@@ -37,9 +37,11 @@ typedef struct SharedFileSet
extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg);
extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg);
extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name);
-extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name);
+extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name,
+ int mode);
extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name,
bool error_on_failure);
extern void SharedFileSetDeleteAll(SharedFileSet *fileset);
+extern void SharedFileSetUnregister(SharedFileSet *input_fileset);
#endif
diff --git a/src/include/utils/sharedbits.h b/src/include/utils/sharedbits.h
new file mode 100644
index 0000000000000..de43279de8dc1
--- /dev/null
+++ b/src/include/utils/sharedbits.h
@@ -0,0 +1,39 @@
+/*-------------------------------------------------------------------------
+ *
+ * sharedbits.h
+ * Simple mechanism for sharing bits between backends.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sharedbits.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SHAREDBITS_H
+#define SHAREDBITS_H
+
+#include "storage/sharedfileset.h"
+
+struct SharedBits;
+typedef struct SharedBits SharedBits;
+
+struct SharedBitsParticipant;
+typedef struct SharedBitsParticipant SharedBitsParticipant;
+
+struct SharedBitsAccessor;
+typedef struct SharedBitsAccessor SharedBitsAccessor;
+
+extern SharedBitsAccessor *sb_attach(SharedBits *sbits, int my_participant_number, SharedFileSet *fileset);
+extern SharedBitsAccessor *sb_initialize(SharedBits *sbits, int participants, int my_participant_number, SharedFileSet *fileset, char *name);
+extern void sb_initialize_accessor(SharedBitsAccessor *accessor, uint32 nbits);
+extern size_t sb_estimate(int participants);
+
+extern void sb_setbit(SharedBitsAccessor *accessor, uint64 bit);
+extern bool sb_checkbit(SharedBitsAccessor *accessor, uint32 n);
+extern BufFile *sb_combine(SharedBitsAccessor *accessor);
+
+extern void sb_end_write(SharedBitsAccessor *sba);
+extern void sb_end_read(SharedBitsAccessor *accessor);
+
+#endif /* SHAREDBITS_H */
diff --git a/src/include/utils/sharedtuplestore.h b/src/include/utils/sharedtuplestore.h
index 9754504cc5367..5f8d95cb1a278 100644
--- a/src/include/utils/sharedtuplestore.h
+++ b/src/include/utils/sharedtuplestore.h
@@ -22,6 +22,17 @@ typedef struct SharedTuplestore SharedTuplestore;
struct SharedTuplestoreAccessor;
typedef struct SharedTuplestoreAccessor SharedTuplestoreAccessor;
+struct tupleMetadata;
+typedef struct tupleMetadata tupleMetadata;
+struct tupleMetadata
+{
+ uint32 hashvalue;
+ union
+ {
+ uint32 tupleid; /* tuple number or id on the outer side */
+ int stripe; /* stripe number for inner side */
+ };
+};
/*
* A flag indicating that the tuplestore will only be scanned once, so backing
@@ -58,4 +69,14 @@ extern void sts_puttuple(SharedTuplestoreAccessor *accessor,
extern MinimalTuple sts_parallel_scan_next(SharedTuplestoreAccessor *accessor,
void *meta_data);
+extern uint32 sts_increment_ntuples(SharedTuplestoreAccessor *accessor);
+extern uint32 sts_get_tuplenum(SharedTuplestoreAccessor *accessor);
+extern int sta_get_read_participant(SharedTuplestoreAccessor *accessor);
+extern void sts_spill_leftover_tuples(SharedTuplestoreAccessor *accessor, MinimalTuple tuple, uint32 hashvalue);
+
+extern MinimalTuple sts_parallel_scan_chunk(SharedTuplestoreAccessor *accessor,
+ void *meta_data,
+ bool inner);
+
+
#endif /* SHAREDTUPLESTORE_H */
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 35b1f05bea659..dea072e5edf5e 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -207,6 +207,13 @@ typedef struct SnapshotData
TimestampTz whenTaken; /* timestamp when snapshot was taken */
XLogRecPtr lsn; /* position in the WAL stream when taken */
+
+ /*
+ * The transaction completion count at the time GetSnapshotData() built
+ * this snapshot. Allows to avoid re-computing static snapshots when no
+ * transactions completed since the last GetSnapshotData().
+ */
+ uint64 snapXactCompletionCount;
} SnapshotData;
#endif /* SNAPSHOT_H */
diff --git a/src/test/kerberos/README b/src/test/kerberos/README
index 93af72e163679..fa9c03e782915 100644
--- a/src/test/kerberos/README
+++ b/src/test/kerberos/README
@@ -8,10 +8,12 @@ functionality. This requires a full MIT Kerberos installation,
including server and client tools, and is therefore kept separate and
not run by default.
-Also, this test suite creates a KDC server that listens for TCP/IP
-connections on localhost without any real access control, so it is not
-safe to run this on a system where there might be untrusted local
-users.
+CAUTION: The test server run by this test is configured to listen for TCP
+connections on localhost. Any user on the same host is able to log in to the
+test server while the tests are running. Do not run this suite on a multi-user
+system where you don't trust all local users! Also, this test suite creates a
+KDC server that listens for TCP/IP connections on localhost without any real
+access control.
Running the tests
=================
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 1428529b041a5..a6d2ffbf9e0e5 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -10,6 +10,7 @@ SUBDIRS = \
delay_execution \
dummy_index_am \
dummy_seclabel \
+ plsample \
snapshot_too_old \
test_bloomfilter \
test_ddl_deparse \
diff --git a/src/test/modules/plsample/.gitignore b/src/test/modules/plsample/.gitignore
new file mode 100644
index 0000000000000..44d119cfcc241
--- /dev/null
+++ b/src/test/modules/plsample/.gitignore
@@ -0,0 +1,3 @@
+# Generated subdirectories
+/log/
+/results/
diff --git a/src/test/modules/plsample/Makefile b/src/test/modules/plsample/Makefile
new file mode 100644
index 0000000000000..f1bc334bfc87c
--- /dev/null
+++ b/src/test/modules/plsample/Makefile
@@ -0,0 +1,20 @@
+# src/test/modules/plsample/Makefile
+
+MODULES = plsample
+
+EXTENSION = plsample
+DATA = plsample--1.0.sql
+PGFILEDESC = "PL/Sample - template for procedural language"
+
+REGRESS = plsample
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/plsample
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/plsample/README b/src/test/modules/plsample/README
new file mode 100644
index 0000000000000..0ed319308d226
--- /dev/null
+++ b/src/test/modules/plsample/README
@@ -0,0 +1,6 @@
+PL/Sample
+=========
+
+PL/Sample is an example template of procedural-language handler. It is
+a simple implementation, yet demonstrates some of the things that can be done
+to build a fully functional procedural-language handler.
diff --git a/src/test/modules/plsample/expected/plsample.out b/src/test/modules/plsample/expected/plsample.out
new file mode 100644
index 0000000000000..a0c318b6df55f
--- /dev/null
+++ b/src/test/modules/plsample/expected/plsample.out
@@ -0,0 +1,36 @@
+CREATE EXTENSION plsample;
+-- Create and test some dummy functions
+CREATE FUNCTION plsample_result_text(a1 numeric, a2 text, a3 integer[])
+RETURNS TEXT
+AS $$
+ Example of source with text result.
+$$ LANGUAGE plsample;
+SELECT plsample_result_text(1.23, 'abc', '{4, 5, 6}');
+NOTICE: source text of function "plsample_result_text":
+ Example of source with text result.
+
+NOTICE: argument: 0; name: a1; value: 1.23
+NOTICE: argument: 1; name: a2; value: abc
+NOTICE: argument: 2; name: a3; value: {4,5,6}
+ plsample_result_text
+---------------------------------------
+ +
+ Example of source with text result.+
+
+(1 row)
+
+CREATE FUNCTION plsample_result_void(a1 text[])
+RETURNS VOID
+AS $$
+ Example of source with void result.
+$$ LANGUAGE plsample;
+SELECT plsample_result_void('{foo, bar, hoge}');
+NOTICE: source text of function "plsample_result_void":
+ Example of source with void result.
+
+NOTICE: argument: 0; name: a1; value: {foo,bar,hoge}
+ plsample_result_void
+----------------------
+
+(1 row)
+
diff --git a/src/test/modules/plsample/plsample--1.0.sql b/src/test/modules/plsample/plsample--1.0.sql
new file mode 100644
index 0000000000000..fc5b280bd4fa5
--- /dev/null
+++ b/src/test/modules/plsample/plsample--1.0.sql
@@ -0,0 +1,14 @@
+/* src/test/modules/plsample/plsample--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION plsample" to load this file. \quit
+
+CREATE FUNCTION plsample_call_handler() RETURNS language_handler
+ AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE TRUSTED LANGUAGE plsample
+ HANDLER plsample_call_handler;
+
+ALTER LANGUAGE plsample OWNER TO @extowner@;
+
+COMMENT ON LANGUAGE plsample IS 'PL/Sample procedural language';
diff --git a/src/test/modules/plsample/plsample.c b/src/test/modules/plsample/plsample.c
new file mode 100644
index 0000000000000..80faef506b151
--- /dev/null
+++ b/src/test/modules/plsample/plsample.c
@@ -0,0 +1,187 @@
+/*-------------------------------------------------------------------------
+ *
+ * plsample.c
+ * Handler for the PL/Sample procedural language
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/test/modules/plsample/plsample.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "commands/event_trigger.h"
+#include "commands/trigger.h"
+#include "funcapi.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(plsample_call_handler);
+
+static Datum plsample_func_handler(PG_FUNCTION_ARGS);
+
+/*
+ * Handle function, procedure, and trigger calls.
+ */
+Datum
+plsample_call_handler(PG_FUNCTION_ARGS)
+{
+ Datum retval = (Datum) 0;
+
+ PG_TRY();
+ {
+ /*
+ * Determine if called as function or trigger and call appropriate
+ * subhandler.
+ */
+ if (CALLED_AS_TRIGGER(fcinfo))
+ {
+ /*
+ * This function has been called as a trigger function, where
+ * (TriggerData *) fcinfo->context includes the information of the
+ * context.
+ */
+ }
+ else if (CALLED_AS_EVENT_TRIGGER(fcinfo))
+ {
+ /*
+ * This function is called as an event trigger function, where
+ * (EventTriggerData *) fcinfo->context includes the information
+ * of the context.
+ */
+ }
+ else
+ {
+ /* Regular function handler */
+ retval = plsample_func_handler(fcinfo);
+ }
+ }
+ PG_FINALLY();
+ {
+ }
+ PG_END_TRY();
+
+ return retval;
+}
+
+/*
+ * plsample_func_handler
+ *
+ * Function called by the call handler for function execution.
+ */
+static Datum
+plsample_func_handler(PG_FUNCTION_ARGS)
+{
+ HeapTuple pl_tuple;
+ Datum ret;
+ char *source;
+ bool isnull;
+ FmgrInfo *arg_out_func;
+ Form_pg_type type_struct;
+ HeapTuple type_tuple;
+ Form_pg_proc pl_struct;
+ volatile MemoryContext proc_cxt = NULL;
+ Oid *argtypes;
+ char **argnames;
+ char *argmodes;
+ char *proname;
+ Form_pg_type pg_type_entry;
+ Oid result_typioparam;
+ Oid prorettype;
+ FmgrInfo result_in_func;
+ int numargs;
+
+ /* Fetch the source text of the function. */
+ pl_tuple = SearchSysCache(PROCOID,
+ ObjectIdGetDatum(fcinfo->flinfo->fn_oid), 0, 0, 0);
+ if (!HeapTupleIsValid(pl_tuple))
+ elog(ERROR, "cache lookup failed for function %u",
+ fcinfo->flinfo->fn_oid);
+
+ /*
+ * Extract and print the source text of the function. This can be used as
+ * a base for the function validation and execution.
+ */
+ pl_struct = (Form_pg_proc) GETSTRUCT(pl_tuple);
+ proname = pstrdup(NameStr(pl_struct->proname));
+ ret = SysCacheGetAttr(PROCOID, pl_tuple, Anum_pg_proc_prosrc, &isnull);
+ if (isnull)
+ elog(ERROR, "could not find source text of function \"%s\"",
+ proname);
+ source = DatumGetCString(DirectFunctionCall1(textout, ret));
+ ereport(NOTICE,
+ (errmsg("source text of function \"%s\": %s",
+ proname, source)));
+
+ /*
+ * Allocate a context that will hold all the Postgres data for the
+ * procedure.
+ */
+ proc_cxt = AllocSetContextCreate(TopMemoryContext,
+ "PL/Sample function",
+ ALLOCSET_SMALL_SIZES);
+
+ arg_out_func = (FmgrInfo *) palloc0(fcinfo->nargs * sizeof(FmgrInfo));
+ numargs = get_func_arg_info(pl_tuple, &argtypes, &argnames, &argmodes);
+
+ /*
+ * Iterate through all of the function arguments, printing each input
+ * value.
+ */
+ for (int i = 0; i < numargs; i++)
+ {
+ Oid argtype = pl_struct->proargtypes.values[i];
+ char *value;
+
+ type_tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(argtype));
+ if (!HeapTupleIsValid(type_tuple))
+ elog(ERROR, "cache lookup failed for type %u", argtype);
+
+ type_struct = (Form_pg_type) GETSTRUCT(type_tuple);
+ fmgr_info_cxt(type_struct->typoutput, &(arg_out_func[i]), proc_cxt);
+ ReleaseSysCache(type_tuple);
+
+ value = OutputFunctionCall(&arg_out_func[i], fcinfo->args[i].value);
+ ereport(NOTICE,
+ (errmsg("argument: %d; name: %s; value: %s",
+ i, argnames[i], value)));
+ }
+
+ /* Type of the result */
+ prorettype = pl_struct->prorettype;
+ ReleaseSysCache(pl_tuple);
+
+ /*
+ * Get the required information for input conversion of the return value.
+ *
+ * If the function uses VOID as result, it is better to return NULL.
+ * Anyway, let's be honest. This is just a template, so there is not much
+ * we can do here. This returns NULL except if the result type is text,
+ * where the result is the source text of the function.
+ */
+ if (prorettype != TEXTOID)
+ PG_RETURN_NULL();
+
+ type_tuple = SearchSysCache1(TYPEOID,
+ ObjectIdGetDatum(prorettype));
+ if (!HeapTupleIsValid(type_tuple))
+ elog(ERROR, "cache lookup failed for type %u", prorettype);
+ pg_type_entry = (Form_pg_type) GETSTRUCT(type_tuple);
+ result_typioparam = getTypeIOParam(type_tuple);
+
+ fmgr_info_cxt(pg_type_entry->typinput, &result_in_func, proc_cxt);
+ ReleaseSysCache(type_tuple);
+
+ ret = InputFunctionCall(&result_in_func, source, result_typioparam, -1);
+ PG_RETURN_DATUM(ret);
+}
diff --git a/src/test/modules/plsample/plsample.control b/src/test/modules/plsample/plsample.control
new file mode 100644
index 0000000000000..1e67251a1e03e
--- /dev/null
+++ b/src/test/modules/plsample/plsample.control
@@ -0,0 +1,8 @@
+# plsample extension
+comment = 'PL/Sample'
+default_version = '1.0'
+module_pathname = '$libdir/plsample'
+relocatable = false
+schema = pg_catalog
+superuser = false
+trusted = true
diff --git a/src/test/modules/plsample/sql/plsample.sql b/src/test/modules/plsample/sql/plsample.sql
new file mode 100644
index 0000000000000..bf0fddac7fc8e
--- /dev/null
+++ b/src/test/modules/plsample/sql/plsample.sql
@@ -0,0 +1,15 @@
+CREATE EXTENSION plsample;
+-- Create and test some dummy functions
+CREATE FUNCTION plsample_result_text(a1 numeric, a2 text, a3 integer[])
+RETURNS TEXT
+AS $$
+ Example of source with text result.
+$$ LANGUAGE plsample;
+SELECT plsample_result_text(1.23, 'abc', '{4, 5, 6}');
+
+CREATE FUNCTION plsample_result_void(a1 text[])
+RETURNS VOID
+AS $$
+ Example of source with void result.
+$$ LANGUAGE plsample;
+SELECT plsample_result_void('{foo, bar, hoge}');
diff --git a/src/test/modules/test_ddl_deparse/expected/create_table.out b/src/test/modules/test_ddl_deparse/expected/create_table.out
index c7c9bf8971f37..0f2a2c164eb56 100644
--- a/src/test/modules/test_ddl_deparse/expected/create_table.out
+++ b/src/test/modules/test_ddl_deparse/expected/create_table.out
@@ -135,6 +135,8 @@ CREATE TABLE like_fkey_table (
INCLUDING STORAGE
);
NOTICE: DDL test: type simple, tag CREATE TABLE
+NOTICE: DDL test: type alter table, tag ALTER TABLE
+NOTICE: subcommand: ALTER COLUMN SET DEFAULT (precooked)
NOTICE: DDL test: type simple, tag CREATE INDEX
NOTICE: DDL test: type simple, tag CREATE INDEX
-- Volatile table types
diff --git a/src/test/modules/test_ddl_deparse/test_ddl_deparse.c b/src/test/modules/test_ddl_deparse/test_ddl_deparse.c
index b7bdb88ce7f7c..def4e39f19deb 100644
--- a/src/test/modules/test_ddl_deparse/test_ddl_deparse.c
+++ b/src/test/modules/test_ddl_deparse/test_ddl_deparse.c
@@ -111,6 +111,9 @@ get_altertable_subcmdtypes(PG_FUNCTION_ARGS)
case AT_ColumnDefault:
strtype = "ALTER COLUMN SET DEFAULT";
break;
+ case AT_CookedColumnDefault:
+ strtype = "ALTER COLUMN SET DEFAULT (precooked)";
+ break;
case AT_DropNotNull:
strtype = "DROP NOT NULL";
break;
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 8c1b77376fb08..1488bffa2ba34 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -1234,10 +1234,8 @@ sub can_bind
return $ret;
}
-# Automatically shut down any still-running nodes when the test script exits.
-# Note that this just stops the postmasters (in the same order the nodes were
-# created in). Any temporary directories are deleted, in an unspecified
-# order, later when the File::Temp objects are destroyed.
+# Automatically shut down any still-running nodes (in the same order the nodes
+# were created in) when the test script exits.
END
{
diff --git a/src/test/recovery/t/010_logical_decoding_timelines.pl b/src/test/recovery/t/010_logical_decoding_timelines.pl
index 09aaefa9f032e..329500f0ae5b7 100644
--- a/src/test/recovery/t/010_logical_decoding_timelines.pl
+++ b/src/test/recovery/t/010_logical_decoding_timelines.pl
@@ -111,7 +111,7 @@
# Examine the physical slot the replica uses to stream changes
# from the primary to make sure its hot_standby_feedback
# has locked in a catalog_xmin on the physical slot, and that
-# any xmin is < the catalog_xmin
+# any xmin is >= the catalog_xmin
$node_primary->poll_query_until(
'postgres', q[
SELECT catalog_xmin IS NOT NULL
diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
index 6f90eae2f8ce9..f56615393ec32 100644
--- a/src/test/regress/expected/alter_table.out
+++ b/src/test/regress/expected/alter_table.out
@@ -3678,6 +3678,42 @@ ALTER TABLE ataddindex
Indexes:
"ataddindex_expr_excl" EXCLUDE USING btree ((f1 ~~ 'a'::text) WITH =)
+DROP TABLE ataddindex;
+CREATE TABLE ataddindex(id int, ref_id int);
+ALTER TABLE ataddindex
+ ADD PRIMARY KEY (id),
+ ADD FOREIGN KEY (ref_id) REFERENCES ataddindex;
+\d ataddindex
+ Table "public.ataddindex"
+ Column | Type | Collation | Nullable | Default
+--------+---------+-----------+----------+---------
+ id | integer | | not null |
+ ref_id | integer | | |
+Indexes:
+ "ataddindex_pkey" PRIMARY KEY, btree (id)
+Foreign-key constraints:
+ "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id)
+Referenced by:
+ TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id)
+
+DROP TABLE ataddindex;
+CREATE TABLE ataddindex(id int, ref_id int);
+ALTER TABLE ataddindex
+ ADD UNIQUE (id),
+ ADD FOREIGN KEY (ref_id) REFERENCES ataddindex (id);
+\d ataddindex
+ Table "public.ataddindex"
+ Column | Type | Collation | Nullable | Default
+--------+---------+-----------+----------+---------
+ id | integer | | |
+ ref_id | integer | | |
+Indexes:
+ "ataddindex_id_key" UNIQUE CONSTRAINT, btree (id)
+Foreign-key constraints:
+ "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id)
+Referenced by:
+ TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id)
+
DROP TABLE ataddindex;
-- unsupported constraint types for partitioned tables
CREATE TABLE partitioned (
diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out
index 0b14c73fc6456..18403498dfab6 100644
--- a/src/test/regress/expected/brin.out
+++ b/src/test/regress/expected/brin.out
@@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea,
int4rangecol int4range,
lsncol pg_lsn,
boxcol box
-) WITH (fillfactor=10);
+) WITH (fillfactor=10, autovacuum_enabled=off);
INSERT INTO brintest SELECT
repeat(stringu1, 8)::bytea,
substr(stringu1, 1, 1)::"char",
diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out
index 655e8e41dd903..e3edbd8b511cd 100644
--- a/src/test/regress/expected/create_table_like.out
+++ b/src/test/regress/expected/create_table_like.out
@@ -160,7 +160,9 @@ SELECT * FROM test_like_gen_3;
DROP TABLE test_like_gen_1, test_like_gen_2, test_like_gen_3;
-- also test generated column with a "forward" reference (bug #16342)
-CREATE TABLE test_like_4 (b int DEFAULT 42, c int GENERATED ALWAYS AS (a * 2) STORED, a int);
+CREATE TABLE test_like_4 (b int DEFAULT 42,
+ c int GENERATED ALWAYS AS (a * 2) STORED,
+ a int CHECK (a > 0));
\d test_like_4
Table "public.test_like_4"
Column | Type | Collation | Nullable | Default
@@ -168,6 +170,8 @@ CREATE TABLE test_like_4 (b int DEFAULT 42, c int GENERATED ALWAYS AS (a * 2) ST
b | integer | | | 42
c | integer | | | generated always as (a * 2) stored
a | integer | | |
+Check constraints:
+ "test_like_4_a_check" CHECK (a > 0)
CREATE TABLE test_like_4a (LIKE test_like_4);
CREATE TABLE test_like_4b (LIKE test_like_4 INCLUDING DEFAULTS);
@@ -233,7 +237,32 @@ SELECT a, b, c FROM test_like_4d;
11 | 42 | 22
(1 row)
+-- Test renumbering of Vars when combining LIKE with inheritance
+CREATE TABLE test_like_5 (x point, y point, z point);
+CREATE TABLE test_like_5x (p int CHECK (p > 0),
+ q int GENERATED ALWAYS AS (p * 2) STORED);
+CREATE TABLE test_like_5c (LIKE test_like_4 INCLUDING ALL)
+ INHERITS (test_like_5, test_like_5x);
+\d test_like_5c
+ Table "public.test_like_5c"
+ Column | Type | Collation | Nullable | Default
+--------+---------+-----------+----------+------------------------------------
+ x | point | | |
+ y | point | | |
+ z | point | | |
+ p | integer | | |
+ q | integer | | | generated always as (p * 2) stored
+ b | integer | | | 42
+ c | integer | | | generated always as (a * 2) stored
+ a | integer | | |
+Check constraints:
+ "test_like_4_a_check" CHECK (a > 0)
+ "test_like_5x_p_check" CHECK (p > 0)
+Inherits: test_like_5,
+ test_like_5x
+
DROP TABLE test_like_4, test_like_4a, test_like_4b, test_like_4c, test_like_4d;
+DROP TABLE test_like_5, test_like_5x, test_like_5c;
CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
INSERT INTO inhg VALUES (5, 10);
INSERT INTO inhg VALUES (20, 10); -- should fail
@@ -269,9 +298,10 @@ ALTER TABLE ctlt1 ALTER COLUMN a SET STORAGE MAIN;
CREATE TABLE ctlt2 (c text);
ALTER TABLE ctlt2 ALTER COLUMN c SET STORAGE EXTERNAL;
COMMENT ON COLUMN ctlt2.c IS 'C';
-CREATE TABLE ctlt3 (a text CHECK (length(a) < 5), c text);
+CREATE TABLE ctlt3 (a text CHECK (length(a) < 5), c text CHECK (length(c) < 7));
ALTER TABLE ctlt3 ALTER COLUMN c SET STORAGE EXTERNAL;
ALTER TABLE ctlt3 ALTER COLUMN a SET STORAGE MAIN;
+CREATE INDEX ctlt3_fnidx ON ctlt3 ((a || c));
COMMENT ON COLUMN ctlt3.a IS 'A3';
COMMENT ON COLUMN ctlt3.c IS 'C';
COMMENT ON CONSTRAINT ctlt3_a_check ON ctlt3 IS 't3_a_check';
@@ -327,10 +357,11 @@ NOTICE: merging multiple inherited definitions of column "a"
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
"ctlt3_a_check" CHECK (length(a) < 5)
+ "ctlt3_c_check" CHECK (length(c) < 7)
Inherits: ctlt1,
ctlt3
-CREATE TABLE ctlt13_like (LIKE ctlt3 INCLUDING CONSTRAINTS INCLUDING COMMENTS INCLUDING STORAGE) INHERITS (ctlt1);
+CREATE TABLE ctlt13_like (LIKE ctlt3 INCLUDING CONSTRAINTS INCLUDING INDEXES INCLUDING COMMENTS INCLUDING STORAGE) INHERITS (ctlt1);
NOTICE: merging column "a" with inherited definition
\d+ ctlt13_like
Table "public.ctlt13_like"
@@ -339,9 +370,12 @@ NOTICE: merging column "a" with inherited definition
a | text | | not null | | main | | A3
b | text | | | | extended | |
c | text | | | | external | | C
+Indexes:
+ "ctlt13_like_expr_idx" btree ((a || c))
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
"ctlt3_a_check" CHECK (length(a) < 5)
+ "ctlt3_c_check" CHECK (length(c) < 7)
Inherits: ctlt1
SELECT description FROM pg_description, pg_constraint c WHERE classoid = 'pg_constraint'::regclass AND objoid = c.oid AND c.conrelid = 'ctlt13_like'::regclass;
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 96baba038c2e2..dc7ab2ce8bfb5 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -23,6 +23,9 @@ begin
-- Ignore text-mode buffers output because it varies depending
-- on the system state
CONTINUE WHEN (ln ~ ' +Buffers: .*');
+ -- Ignore text-mode "Planning:" line because whether it's output
+ -- varies depending on the system state
+ CONTINUE WHEN (ln = 'Planning:');
return next ln;
end loop;
end;
@@ -106,7 +109,6 @@ select explain_filter('explain (analyze, buffers, format json) select * from int
"Temp Written Blocks": N +
}, +
"Planning": { +
- "Planning Time": N.N, +
"Shared Hit Blocks": N, +
"Shared Read Blocks": N, +
"Shared Dirtied Blocks": N, +
@@ -118,6 +120,7 @@ select explain_filter('explain (analyze, buffers, format json) select * from int
"Temp Read Blocks": N, +
"Temp Written Blocks": N +
}, +
+ "Planning Time": N.N, +
"Triggers": [ +
], +
"Execution Time": N.N +
@@ -155,7 +158,6 @@ select explain_filter('explain (analyze, buffers, format xml) select * from int8
N +
+
+
- N.N +
N +
N +
N+
@@ -167,6 +169,7 @@ select explain_filter('explain (analyze, buffers, format xml) select * from int8
N +
N +
+
+ N.N +
+
+
N.N +
@@ -201,7 +204,6 @@ select explain_filter('explain (analyze, buffers, format yaml) select * from int
Temp Read Blocks: N +
Temp Written Blocks: N +
Planning: +
- Planning Time: N.N +
Shared Hit Blocks: N +
Shared Read Blocks: N +
Shared Dirtied Blocks: N +
@@ -212,10 +214,58 @@ select explain_filter('explain (analyze, buffers, format yaml) select * from int
Local Written Blocks: N +
Temp Read Blocks: N +
Temp Written Blocks: N +
+ Planning Time: N.N +
Triggers: +
Execution Time: N.N
(1 row)
+select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
+ explain_filter
+---------------------------------------------------------
+ Seq Scan on int8_tbl i8 (cost=N.N..N.N rows=N width=N)
+(1 row)
+
+select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
+ explain_filter
+------------------------------------
+ [ +
+ { +
+ "Plan": { +
+ "Node Type": "Seq Scan", +
+ "Parallel Aware": false, +
+ "Relation Name": "int8_tbl",+
+ "Alias": "i8", +
+ "Startup Cost": N.N, +
+ "Total Cost": N.N, +
+ "Plan Rows": N, +
+ "Plan Width": N, +
+ "Shared Hit Blocks": N, +
+ "Shared Read Blocks": N, +
+ "Shared Dirtied Blocks": N, +
+ "Shared Written Blocks": N, +
+ "Local Hit Blocks": N, +
+ "Local Read Blocks": N, +
+ "Local Dirtied Blocks": N, +
+ "Local Written Blocks": N, +
+ "Temp Read Blocks": N, +
+ "Temp Written Blocks": N +
+ }, +
+ "Planning": { +
+ "Shared Hit Blocks": N, +
+ "Shared Read Blocks": N, +
+ "Shared Dirtied Blocks": N, +
+ "Shared Written Blocks": N, +
+ "Local Hit Blocks": N, +
+ "Local Read Blocks": N, +
+ "Local Dirtied Blocks": N, +
+ "Local Written Blocks": N, +
+ "Temp Read Blocks": N, +
+ "Temp Written Blocks": N +
+ } +
+ } +
+ ]
+(1 row)
+
-- SETTINGS option
-- We have to ignore other settings that might be imposed by the environment,
-- so printing the whole Settings field unfortunately won't do.
@@ -402,7 +452,6 @@ select jsonb_pretty(
"Shared Written Blocks": 0 +
}, +
"Planning": { +
- "Planning Time": 0.0, +
"Local Hit Blocks": 0, +
"Temp Read Blocks": 0, +
"Local Read Blocks": 0, +
@@ -416,6 +465,7 @@ select jsonb_pretty(
}, +
"Triggers": [ +
], +
+ "Planning Time": 0.0, +
"Execution Time": 0.0 +
} +
]
diff --git a/src/test/regress/expected/gin.out b/src/test/regress/expected/gin.out
index 83de5220fb9ce..b335466fc4bae 100644
--- a/src/test/regress/expected/gin.out
+++ b/src/test/regress/expected/gin.out
@@ -199,6 +199,71 @@ from
i @> '{1}' and j @> '{10}' | 2 | 0 | t
(10 rows)
+reset enable_seqscan;
+reset enable_bitmapscan;
+-- re-purpose t_gin_test_tbl to test scans involving posting trees
+insert into t_gin_test_tbl select array[1, g, g/10], array[2, g, g/10]
+ from generate_series(1, 20000) g;
+select gin_clean_pending_list('t_gin_test_tbl_i_j_idx') is not null;
+ ?column?
+----------
+ t
+(1 row)
+
+analyze t_gin_test_tbl;
+set enable_seqscan = off;
+set enable_bitmapscan = on;
+explain (costs off)
+select count(*) from t_gin_test_tbl where j @> array[50];
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on t_gin_test_tbl
+ Recheck Cond: (j @> '{50}'::integer[])
+ -> Bitmap Index Scan on t_gin_test_tbl_i_j_idx
+ Index Cond: (j @> '{50}'::integer[])
+(5 rows)
+
+select count(*) from t_gin_test_tbl where j @> array[50];
+ count
+-------
+ 11
+(1 row)
+
+explain (costs off)
+select count(*) from t_gin_test_tbl where j @> array[2];
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on t_gin_test_tbl
+ Recheck Cond: (j @> '{2}'::integer[])
+ -> Bitmap Index Scan on t_gin_test_tbl_i_j_idx
+ Index Cond: (j @> '{2}'::integer[])
+(5 rows)
+
+select count(*) from t_gin_test_tbl where j @> array[2];
+ count
+-------
+ 20000
+(1 row)
+
+explain (costs off)
+select count(*) from t_gin_test_tbl where j @> '{}'::int[];
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on t_gin_test_tbl
+ Recheck Cond: (j @> '{}'::integer[])
+ -> Bitmap Index Scan on t_gin_test_tbl_i_j_idx
+ Index Cond: (j @> '{}'::integer[])
+(5 rows)
+
+select count(*) from t_gin_test_tbl where j @> '{}'::int[];
+ count
+-------
+ 20006
+(1 row)
+
reset enable_seqscan;
reset enable_bitmapscan;
drop table t_gin_test_tbl;
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index 03ada654bb572..701d52b465d5a 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -434,6 +434,38 @@ select x, not x as not_x, q2 from
| | 4567890123456789
(5 rows)
+-- check qual push-down rules for a subquery with grouping sets
+explain (verbose, costs off)
+select * from (
+ select 1 as x, q1, sum(q2)
+ from int8_tbl i1
+ group by grouping sets(1, 2)
+) ss
+where x = 1 and q1 = 123;
+ QUERY PLAN
+--------------------------------------------
+ Subquery Scan on ss
+ Output: ss.x, ss.q1, ss.sum
+ Filter: ((ss.x = 1) AND (ss.q1 = 123))
+ -> GroupAggregate
+ Output: (1), i1.q1, sum(i1.q2)
+ Group Key: 1
+ Sort Key: i1.q1
+ Group Key: i1.q1
+ -> Seq Scan on public.int8_tbl i1
+ Output: 1, i1.q1, i1.q2
+(10 rows)
+
+select * from (
+ select 1 as x, q1, sum(q2)
+ from int8_tbl i1
+ group by grouping sets(1, 2)
+) ss
+where x = 1 and q1 = 123;
+ x | q1 | sum
+---+----+-----
+(0 rows)
+
-- simple rescan tests
select a, b, sum(v.x)
from (values (1),(2)) v(x), gstest_data(v.x)
diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out
index 3a91c144a27fc..aa7477a29980d 100644
--- a/src/test/regress/expected/join_hash.out
+++ b/src/test/regress/expected/join_hash.out
@@ -839,45 +839,26 @@ rollback to settings;
-- the hash table)
-- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and
-- sts_puttuple oversized tuple cases because it's multi-batch)
-savepoint settings;
-set max_parallel_workers_per_gather = 2;
-set enable_parallel_hash = on;
-set work_mem = '128kB';
-explain (costs off)
- select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
- QUERY PLAN
-----------------------------------------------------------------
- Finalize Aggregate
- -> Gather
- Workers Planned: 2
- -> Partial Aggregate
- -> Parallel Hash Left Join
- Hash Cond: (wide.id = wide_1.id)
- -> Parallel Seq Scan on wide
- -> Parallel Hash
- -> Parallel Seq Scan on wide wide_1
-(9 rows)
-
-select length(max(s.t))
-from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
- length
---------
- 320000
-(1 row)
-
-select final > 1 as multibatch
- from hash_join_batches(
-$$
- select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
-$$);
- multibatch
-------------
- t
-(1 row)
-
-rollback to settings;
+-- savepoint settings;
+-- set max_parallel_workers_per_gather = 2;
+-- set enable_parallel_hash = on;
+-- TODO: throw an error when this happens: cannot set work_mem lower than the side of a single tuple
+-- TODO: ensure that oversize tuple code is still exercised (should be with some of the stub stuff below)
+-- TODO: commented this out since it would crash otherwise
+-- this test is no longer multi-batch, so, perhaps, it should be removed
+-- set work_mem = '128kB';
+-- explain (costs off)
+-- select length(max(s.t))
+-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+-- select length(max(s.t))
+-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+-- select final > 1 as multibatch
+-- from hash_join_batches(
+-- $$
+-- select length(max(s.t))
+-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+-- $$);
+-- rollback to settings;
rollback;
-- Verify that hash key expressions reference the correct
-- nodes. Hashjoin's hashkeys need to reference its outer plan, Hash's
@@ -1013,3 +994,1968 @@ WHERE
(1 row)
ROLLBACK;
+-- Serial Adaptive Hash Join
+BEGIN;
+CREATE TYPE stub AS (hash INTEGER, value CHAR(8090));
+CREATE FUNCTION stub_hash(item stub)
+RETURNS INTEGER AS $$
+DECLARE
+ batch_size INTEGER;
+BEGIN
+ batch_size := 4;
+ RETURN item.hash << (batch_size - 1);
+END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE;
+CREATE FUNCTION stub_eq(item1 stub, item2 stub)
+RETURNS BOOLEAN AS $$
+BEGIN
+ RETURN item1.hash = item2.hash AND item1.value = item2.value;
+END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE;
+CREATE OPERATOR = (
+ FUNCTION = stub_eq,
+ LEFTARG = stub,
+ RIGHTARG = stub,
+ COMMUTATOR = =,
+ HASHES, MERGES
+);
+CREATE OPERATOR CLASS stub_hash_ops
+DEFAULT FOR TYPE stub USING hash AS
+ OPERATOR 1 =(stub, stub),
+ FUNCTION 1 stub_hash(stub);
+CREATE TABLE probeside(a stub);
+ALTER TABLE probeside ALTER COLUMN a SET STORAGE PLAIN;
+-- non-fallback batch with unmatched outer tuple
+INSERT INTO probeside SELECT '(2, "")' FROM generate_series(1, 1);
+-- fallback batch unmatched outer tuple (in first stripe maybe)
+INSERT INTO probeside SELECT '(1, "unmatched outer tuple")' FROM generate_series(1, 1);
+-- fallback batch matched outer tuple
+INSERT INTO probeside SELECT '(1, "")' FROM generate_series(1, 5);
+-- fallback batch unmatched outer tuple (in last stripe maybe)
+-- When numbatches=4, hash 5 maps to batch 1, but after numbatches doubles to
+-- 8 batches hash 5 maps to batch 5.
+INSERT INTO probeside SELECT '(5, "")' FROM generate_series(1, 1);
+-- non-fallback batch matched outer tuple
+INSERT INTO probeside SELECT '(3, "")' FROM generate_series(1, 1);
+-- batch with 3 stripes where non-first/non-last stripe contains unmatched outer tuple
+INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 5);
+INSERT INTO probeside SELECT '(6, "unmatched outer tuple")' FROM generate_series(1, 1);
+INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 1);
+CREATE TABLE hashside_wide(a stub, id int);
+ALTER TABLE hashside_wide ALTER COLUMN a SET STORAGE PLAIN;
+-- falls back with an unmatched inner tuple that is in fist, middle, and last
+-- stripe
+INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in first stripe")', 1 FROM generate_series(1, 1);
+INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9);
+INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in middle stripe")', 1 FROM generate_series(1, 1);
+INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9);
+INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in last stripe")', 1 FROM generate_series(1, 1);
+-- doesn't fall back -- matched tuple
+INSERT INTO hashside_wide SELECT '(3, "")', 3 FROM generate_series(1, 1);
+INSERT INTO hashside_wide SELECT '(6, "")', 6 FROM generate_series(1, 20);
+ANALYZE probeside, hashside_wide;
+SET enable_nestloop TO off;
+SET enable_mergejoin TO off;
+SET work_mem = 64;
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+ hash | btrim | id | hash | btrim
+------+-----------------------+----+------+-------
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | unmatched outer tuple | | |
+ 2 | | | |
+ 3 | | 3 | 3 |
+ 5 | | | |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | unmatched outer tuple | | |
+(215 rows)
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a);
+ QUERY PLAN
+----------------------------------------------------------------
+ Hash Left Join (actual rows=215 loops=1)
+ Hash Cond: (probeside.a = hashside_wide.a)
+ -> Seq Scan on probeside (actual rows=16 loops=1)
+ -> Hash (actual rows=42 loops=1)
+ Buckets: 8 (originally 8) Batches: 32 (originally 8)
+ Batch: 1 Stripes: 3
+ Batch: 6 Stripes: 3
+ -> Seq Scan on hashside_wide (actual rows=42 loops=1)
+(8 rows)
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+RIGHT OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+ hash | btrim | id | hash | btrim
+------+-------+----+------+----------------------------------------
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 3 | | 3 | 3 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ | | 1 | 1 | unmatched inner tuple in first stripe
+ | | 1 | 1 | unmatched inner tuple in last stripe
+ | | 1 | 1 | unmatched inner tuple in middle stripe
+(214 rows)
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+RIGHT OUTER JOIN hashside_wide USING (a);
+ QUERY PLAN
+----------------------------------------------------------------
+ Hash Right Join (actual rows=214 loops=1)
+ Hash Cond: (probeside.a = hashside_wide.a)
+ -> Seq Scan on probeside (actual rows=16 loops=1)
+ -> Hash (actual rows=42 loops=1)
+ Buckets: 8 (originally 8) Batches: 32 (originally 8)
+ Batch: 1 Stripes: 3
+ Batch: 6 Stripes: 3
+ -> Seq Scan on hashside_wide (actual rows=42 loops=1)
+(8 rows)
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+FULL OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+ hash | btrim | id | hash | btrim
+------+-----------------------+----+------+----------------------------------------
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | unmatched outer tuple | | |
+ 2 | | | |
+ 3 | | 3 | 3 |
+ 5 | | | |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | unmatched outer tuple | | |
+ | | 1 | 1 | unmatched inner tuple in first stripe
+ | | 1 | 1 | unmatched inner tuple in last stripe
+ | | 1 | 1 | unmatched inner tuple in middle stripe
+(218 rows)
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+FULL OUTER JOIN hashside_wide USING (a);
+ QUERY PLAN
+----------------------------------------------------------------
+ Hash Full Join (actual rows=218 loops=1)
+ Hash Cond: (probeside.a = hashside_wide.a)
+ -> Seq Scan on probeside (actual rows=16 loops=1)
+ -> Hash (actual rows=42 loops=1)
+ Buckets: 8 (originally 8) Batches: 32 (originally 8)
+ Batch: 1 Stripes: 3
+ Batch: 6 Stripes: 3
+ -> Seq Scan on hashside_wide (actual rows=42 loops=1)
+(8 rows)
+
+-- semi-join testcase
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off)
+SELECT probeside.* FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a);
+ QUERY PLAN
+----------------------------------------------------------------
+ Hash Semi Join (actual rows=12 loops=1)
+ Hash Cond: (probeside.a = hashside_wide.a)
+ -> Seq Scan on probeside (actual rows=16 loops=1)
+ -> Hash (actual rows=42 loops=1)
+ Buckets: 8 (originally 8) Batches: 32 (originally 8)
+ Batch: 1 Stripes: 3
+ Batch: 6 Stripes: 3
+ -> Seq Scan on hashside_wide (actual rows=42 loops=1)
+(8 rows)
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value)
+FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2;
+ hash | btrim
+------+-------
+ 1 |
+ 1 |
+ 1 |
+ 1 |
+ 1 |
+ 3 |
+ 6 |
+ 6 |
+ 6 |
+ 6 |
+ 6 |
+ 6 |
+(12 rows)
+
+-- anti-join testcase
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off)
+SELECT probeside.* FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a);
+ QUERY PLAN
+----------------------------------------------------------------
+ Hash Anti Join (actual rows=4 loops=1)
+ Hash Cond: (probeside.a = hashside_wide.a)
+ -> Seq Scan on probeside (actual rows=16 loops=1)
+ -> Hash (actual rows=42 loops=1)
+ Buckets: 8 (originally 8) Batches: 32 (originally 8)
+ Batch: 1 Stripes: 3
+ Batch: 6 Stripes: 3
+ -> Seq Scan on hashside_wide (actual rows=42 loops=1)
+(8 rows)
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value)
+FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2;
+ hash | btrim
+------+-----------------------
+ 1 | unmatched outer tuple
+ 2 |
+ 5 |
+ 6 | unmatched outer tuple
+(4 rows)
+
+-- parallel LOJ test case with two batches falling back
+savepoint settings;
+set local max_parallel_workers_per_gather = 1;
+set local min_parallel_table_scan_size = 0;
+set local parallel_setup_cost = 0;
+set local enable_parallel_hash = on;
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a);
+ QUERY PLAN
+-------------------------------------------------------------------------------
+ Gather (actual rows=215 loops=1)
+ Workers Planned: 1
+ Workers Launched: 1
+ -> Parallel Hash Left Join (actual rows=108 loops=2)
+ Hash Cond: (probeside.a = hashside_wide.a)
+ -> Parallel Seq Scan on probeside (actual rows=16 loops=1)
+ -> Parallel Hash (actual rows=21 loops=2)
+ Buckets: 8 (originally 8) Batches: 128 (originally 8)
+ Batch: 1 Stripes: 3
+ Batch: 6 Stripes: 3
+ -> Parallel Seq Scan on hashside_wide (actual rows=42 loops=1)
+(11 rows)
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+ hash | btrim | id | hash | btrim
+------+-----------------------+----+------+-------
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | | 1 | 1 |
+ 1 | unmatched outer tuple | | |
+ 2 | | | |
+ 3 | | 3 | 3 |
+ 5 | | | |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | | 6 | 6 |
+ 6 | unmatched outer tuple | | |
+(215 rows)
+
+rollback to settings;
+-- Test spill of batch 0 gives correct results.
+CREATE TABLE probeside_batch0(id int generated always as identity, a stub);
+ALTER TABLE probeside_batch0 ALTER COLUMN a SET STORAGE PLAIN;
+INSERT INTO probeside_batch0(a) SELECT '(0, "")' FROM generate_series(1, 13);
+INSERT INTO probeside_batch0(a) SELECT '(0, "unmatched outer")' FROM generate_series(1, 1);
+CREATE TABLE hashside_wide_batch0(id int generated always as identity, a stub);
+ALTER TABLE hashside_wide_batch0 ALTER COLUMN a SET STORAGE PLAIN;
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+ANALYZE probeside_batch0, hashside_wide_batch0;
+SELECT
+ hashside_wide_batch0.id as hashside_id,
+ (hashside_wide_batch0.a).hash as hashside_hash,
+ probeside_batch0.id as probeside_id,
+ (probeside_batch0.a).hash as probeside_hash,
+ TRIM((probeside_batch0.a).value) as probeside_trimmed_value,
+ TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value
+FROM probeside_batch0
+LEFT OUTER JOIN hashside_wide_batch0 USING (a)
+ORDER BY 1, 2, 3, 4, 5, 6;
+ hashside_id | hashside_hash | probeside_id | probeside_hash | probeside_trimmed_value | hashside_trimmed_value
+-------------+---------------+--------------+----------------+-------------------------+------------------------
+ 1 | 0 | 1 | 0 | |
+ 1 | 0 | 2 | 0 | |
+ 1 | 0 | 3 | 0 | |
+ 1 | 0 | 4 | 0 | |
+ 1 | 0 | 5 | 0 | |
+ 1 | 0 | 6 | 0 | |
+ 1 | 0 | 7 | 0 | |
+ 1 | 0 | 8 | 0 | |
+ 1 | 0 | 9 | 0 | |
+ 1 | 0 | 10 | 0 | |
+ 1 | 0 | 11 | 0 | |
+ 1 | 0 | 12 | 0 | |
+ 1 | 0 | 13 | 0 | |
+ 2 | 0 | 1 | 0 | |
+ 2 | 0 | 2 | 0 | |
+ 2 | 0 | 3 | 0 | |
+ 2 | 0 | 4 | 0 | |
+ 2 | 0 | 5 | 0 | |
+ 2 | 0 | 6 | 0 | |
+ 2 | 0 | 7 | 0 | |
+ 2 | 0 | 8 | 0 | |
+ 2 | 0 | 9 | 0 | |
+ 2 | 0 | 10 | 0 | |
+ 2 | 0 | 11 | 0 | |
+ 2 | 0 | 12 | 0 | |
+ 2 | 0 | 13 | 0 | |
+ 3 | 0 | 1 | 0 | |
+ 3 | 0 | 2 | 0 | |
+ 3 | 0 | 3 | 0 | |
+ 3 | 0 | 4 | 0 | |
+ 3 | 0 | 5 | 0 | |
+ 3 | 0 | 6 | 0 | |
+ 3 | 0 | 7 | 0 | |
+ 3 | 0 | 8 | 0 | |
+ 3 | 0 | 9 | 0 | |
+ 3 | 0 | 10 | 0 | |
+ 3 | 0 | 11 | 0 | |
+ 3 | 0 | 12 | 0 | |
+ 3 | 0 | 13 | 0 | |
+ 4 | 0 | 1 | 0 | |
+ 4 | 0 | 2 | 0 | |
+ 4 | 0 | 3 | 0 | |
+ 4 | 0 | 4 | 0 | |
+ 4 | 0 | 5 | 0 | |
+ 4 | 0 | 6 | 0 | |
+ 4 | 0 | 7 | 0 | |
+ 4 | 0 | 8 | 0 | |
+ 4 | 0 | 9 | 0 | |
+ 4 | 0 | 10 | 0 | |
+ 4 | 0 | 11 | 0 | |
+ 4 | 0 | 12 | 0 | |
+ 4 | 0 | 13 | 0 | |
+ 5 | 0 | 1 | 0 | |
+ 5 | 0 | 2 | 0 | |
+ 5 | 0 | 3 | 0 | |
+ 5 | 0 | 4 | 0 | |
+ 5 | 0 | 5 | 0 | |
+ 5 | 0 | 6 | 0 | |
+ 5 | 0 | 7 | 0 | |
+ 5 | 0 | 8 | 0 | |
+ 5 | 0 | 9 | 0 | |
+ 5 | 0 | 10 | 0 | |
+ 5 | 0 | 11 | 0 | |
+ 5 | 0 | 12 | 0 | |
+ 5 | 0 | 13 | 0 | |
+ 6 | 0 | 1 | 0 | |
+ 6 | 0 | 2 | 0 | |
+ 6 | 0 | 3 | 0 | |
+ 6 | 0 | 4 | 0 | |
+ 6 | 0 | 5 | 0 | |
+ 6 | 0 | 6 | 0 | |
+ 6 | 0 | 7 | 0 | |
+ 6 | 0 | 8 | 0 | |
+ 6 | 0 | 9 | 0 | |
+ 6 | 0 | 10 | 0 | |
+ 6 | 0 | 11 | 0 | |
+ 6 | 0 | 12 | 0 | |
+ 6 | 0 | 13 | 0 | |
+ 7 | 0 | 1 | 0 | |
+ 7 | 0 | 2 | 0 | |
+ 7 | 0 | 3 | 0 | |
+ 7 | 0 | 4 | 0 | |
+ 7 | 0 | 5 | 0 | |
+ 7 | 0 | 6 | 0 | |
+ 7 | 0 | 7 | 0 | |
+ 7 | 0 | 8 | 0 | |
+ 7 | 0 | 9 | 0 | |
+ 7 | 0 | 10 | 0 | |
+ 7 | 0 | 11 | 0 | |
+ 7 | 0 | 12 | 0 | |
+ 7 | 0 | 13 | 0 | |
+ 8 | 0 | 1 | 0 | |
+ 8 | 0 | 2 | 0 | |
+ 8 | 0 | 3 | 0 | |
+ 8 | 0 | 4 | 0 | |
+ 8 | 0 | 5 | 0 | |
+ 8 | 0 | 6 | 0 | |
+ 8 | 0 | 7 | 0 | |
+ 8 | 0 | 8 | 0 | |
+ 8 | 0 | 9 | 0 | |
+ 8 | 0 | 10 | 0 | |
+ 8 | 0 | 11 | 0 | |
+ 8 | 0 | 12 | 0 | |
+ 8 | 0 | 13 | 0 | |
+ 9 | 0 | 1 | 0 | |
+ 9 | 0 | 2 | 0 | |
+ 9 | 0 | 3 | 0 | |
+ 9 | 0 | 4 | 0 | |
+ 9 | 0 | 5 | 0 | |
+ 9 | 0 | 6 | 0 | |
+ 9 | 0 | 7 | 0 | |
+ 9 | 0 | 8 | 0 | |
+ 9 | 0 | 9 | 0 | |
+ 9 | 0 | 10 | 0 | |
+ 9 | 0 | 11 | 0 | |
+ 9 | 0 | 12 | 0 | |
+ 9 | 0 | 13 | 0 | |
+ 10 | 0 | 1 | 0 | |
+ 10 | 0 | 2 | 0 | |
+ 10 | 0 | 3 | 0 | |
+ 10 | 0 | 4 | 0 | |
+ 10 | 0 | 5 | 0 | |
+ 10 | 0 | 6 | 0 | |
+ 10 | 0 | 7 | 0 | |
+ 10 | 0 | 8 | 0 | |
+ 10 | 0 | 9 | 0 | |
+ 10 | 0 | 10 | 0 | |
+ 10 | 0 | 11 | 0 | |
+ 10 | 0 | 12 | 0 | |
+ 10 | 0 | 13 | 0 | |
+ 11 | 0 | 1 | 0 | |
+ 11 | 0 | 2 | 0 | |
+ 11 | 0 | 3 | 0 | |
+ 11 | 0 | 4 | 0 | |
+ 11 | 0 | 5 | 0 | |
+ 11 | 0 | 6 | 0 | |
+ 11 | 0 | 7 | 0 | |
+ 11 | 0 | 8 | 0 | |
+ 11 | 0 | 9 | 0 | |
+ 11 | 0 | 10 | 0 | |
+ 11 | 0 | 11 | 0 | |
+ 11 | 0 | 12 | 0 | |
+ 11 | 0 | 13 | 0 | |
+ 12 | 0 | 1 | 0 | |
+ 12 | 0 | 2 | 0 | |
+ 12 | 0 | 3 | 0 | |
+ 12 | 0 | 4 | 0 | |
+ 12 | 0 | 5 | 0 | |
+ 12 | 0 | 6 | 0 | |
+ 12 | 0 | 7 | 0 | |
+ 12 | 0 | 8 | 0 | |
+ 12 | 0 | 9 | 0 | |
+ 12 | 0 | 10 | 0 | |
+ 12 | 0 | 11 | 0 | |
+ 12 | 0 | 12 | 0 | |
+ 12 | 0 | 13 | 0 | |
+ 13 | 0 | 1 | 0 | |
+ 13 | 0 | 2 | 0 | |
+ 13 | 0 | 3 | 0 | |
+ 13 | 0 | 4 | 0 | |
+ 13 | 0 | 5 | 0 | |
+ 13 | 0 | 6 | 0 | |
+ 13 | 0 | 7 | 0 | |
+ 13 | 0 | 8 | 0 | |
+ 13 | 0 | 9 | 0 | |
+ 13 | 0 | 10 | 0 | |
+ 13 | 0 | 11 | 0 | |
+ 13 | 0 | 12 | 0 | |
+ 13 | 0 | 13 | 0 | |
+ 14 | 0 | 1 | 0 | |
+ 14 | 0 | 2 | 0 | |
+ 14 | 0 | 3 | 0 | |
+ 14 | 0 | 4 | 0 | |
+ 14 | 0 | 5 | 0 | |
+ 14 | 0 | 6 | 0 | |
+ 14 | 0 | 7 | 0 | |
+ 14 | 0 | 8 | 0 | |
+ 14 | 0 | 9 | 0 | |
+ 14 | 0 | 10 | 0 | |
+ 14 | 0 | 11 | 0 | |
+ 14 | 0 | 12 | 0 | |
+ 14 | 0 | 13 | 0 | |
+ 15 | 0 | 1 | 0 | |
+ 15 | 0 | 2 | 0 | |
+ 15 | 0 | 3 | 0 | |
+ 15 | 0 | 4 | 0 | |
+ 15 | 0 | 5 | 0 | |
+ 15 | 0 | 6 | 0 | |
+ 15 | 0 | 7 | 0 | |
+ 15 | 0 | 8 | 0 | |
+ 15 | 0 | 9 | 0 | |
+ 15 | 0 | 10 | 0 | |
+ 15 | 0 | 11 | 0 | |
+ 15 | 0 | 12 | 0 | |
+ 15 | 0 | 13 | 0 | |
+ 16 | 0 | 1 | 0 | |
+ 16 | 0 | 2 | 0 | |
+ 16 | 0 | 3 | 0 | |
+ 16 | 0 | 4 | 0 | |
+ 16 | 0 | 5 | 0 | |
+ 16 | 0 | 6 | 0 | |
+ 16 | 0 | 7 | 0 | |
+ 16 | 0 | 8 | 0 | |
+ 16 | 0 | 9 | 0 | |
+ 16 | 0 | 10 | 0 | |
+ 16 | 0 | 11 | 0 | |
+ 16 | 0 | 12 | 0 | |
+ 16 | 0 | 13 | 0 | |
+ 17 | 0 | 1 | 0 | |
+ 17 | 0 | 2 | 0 | |
+ 17 | 0 | 3 | 0 | |
+ 17 | 0 | 4 | 0 | |
+ 17 | 0 | 5 | 0 | |
+ 17 | 0 | 6 | 0 | |
+ 17 | 0 | 7 | 0 | |
+ 17 | 0 | 8 | 0 | |
+ 17 | 0 | 9 | 0 | |
+ 17 | 0 | 10 | 0 | |
+ 17 | 0 | 11 | 0 | |
+ 17 | 0 | 12 | 0 | |
+ 17 | 0 | 13 | 0 | |
+ 18 | 0 | 1 | 0 | |
+ 18 | 0 | 2 | 0 | |
+ 18 | 0 | 3 | 0 | |
+ 18 | 0 | 4 | 0 | |
+ 18 | 0 | 5 | 0 | |
+ 18 | 0 | 6 | 0 | |
+ 18 | 0 | 7 | 0 | |
+ 18 | 0 | 8 | 0 | |
+ 18 | 0 | 9 | 0 | |
+ 18 | 0 | 10 | 0 | |
+ 18 | 0 | 11 | 0 | |
+ 18 | 0 | 12 | 0 | |
+ 18 | 0 | 13 | 0 | |
+ 19 | 0 | 1 | 0 | |
+ 19 | 0 | 2 | 0 | |
+ 19 | 0 | 3 | 0 | |
+ 19 | 0 | 4 | 0 | |
+ 19 | 0 | 5 | 0 | |
+ 19 | 0 | 6 | 0 | |
+ 19 | 0 | 7 | 0 | |
+ 19 | 0 | 8 | 0 | |
+ 19 | 0 | 9 | 0 | |
+ 19 | 0 | 10 | 0 | |
+ 19 | 0 | 11 | 0 | |
+ 19 | 0 | 12 | 0 | |
+ 19 | 0 | 13 | 0 | |
+ 20 | 0 | 1 | 0 | |
+ 20 | 0 | 2 | 0 | |
+ 20 | 0 | 3 | 0 | |
+ 20 | 0 | 4 | 0 | |
+ 20 | 0 | 5 | 0 | |
+ 20 | 0 | 6 | 0 | |
+ 20 | 0 | 7 | 0 | |
+ 20 | 0 | 8 | 0 | |
+ 20 | 0 | 9 | 0 | |
+ 20 | 0 | 10 | 0 | |
+ 20 | 0 | 11 | 0 | |
+ 20 | 0 | 12 | 0 | |
+ 20 | 0 | 13 | 0 | |
+ 21 | 0 | 1 | 0 | |
+ 21 | 0 | 2 | 0 | |
+ 21 | 0 | 3 | 0 | |
+ 21 | 0 | 4 | 0 | |
+ 21 | 0 | 5 | 0 | |
+ 21 | 0 | 6 | 0 | |
+ 21 | 0 | 7 | 0 | |
+ 21 | 0 | 8 | 0 | |
+ 21 | 0 | 9 | 0 | |
+ 21 | 0 | 10 | 0 | |
+ 21 | 0 | 11 | 0 | |
+ 21 | 0 | 12 | 0 | |
+ 21 | 0 | 13 | 0 | |
+ 22 | 0 | 1 | 0 | |
+ 22 | 0 | 2 | 0 | |
+ 22 | 0 | 3 | 0 | |
+ 22 | 0 | 4 | 0 | |
+ 22 | 0 | 5 | 0 | |
+ 22 | 0 | 6 | 0 | |
+ 22 | 0 | 7 | 0 | |
+ 22 | 0 | 8 | 0 | |
+ 22 | 0 | 9 | 0 | |
+ 22 | 0 | 10 | 0 | |
+ 22 | 0 | 11 | 0 | |
+ 22 | 0 | 12 | 0 | |
+ 22 | 0 | 13 | 0 | |
+ 23 | 0 | 1 | 0 | |
+ 23 | 0 | 2 | 0 | |
+ 23 | 0 | 3 | 0 | |
+ 23 | 0 | 4 | 0 | |
+ 23 | 0 | 5 | 0 | |
+ 23 | 0 | 6 | 0 | |
+ 23 | 0 | 7 | 0 | |
+ 23 | 0 | 8 | 0 | |
+ 23 | 0 | 9 | 0 | |
+ 23 | 0 | 10 | 0 | |
+ 23 | 0 | 11 | 0 | |
+ 23 | 0 | 12 | 0 | |
+ 23 | 0 | 13 | 0 | |
+ 24 | 0 | 1 | 0 | |
+ 24 | 0 | 2 | 0 | |
+ 24 | 0 | 3 | 0 | |
+ 24 | 0 | 4 | 0 | |
+ 24 | 0 | 5 | 0 | |
+ 24 | 0 | 6 | 0 | |
+ 24 | 0 | 7 | 0 | |
+ 24 | 0 | 8 | 0 | |
+ 24 | 0 | 9 | 0 | |
+ 24 | 0 | 10 | 0 | |
+ 24 | 0 | 11 | 0 | |
+ 24 | 0 | 12 | 0 | |
+ 24 | 0 | 13 | 0 | |
+ 25 | 0 | 1 | 0 | |
+ 25 | 0 | 2 | 0 | |
+ 25 | 0 | 3 | 0 | |
+ 25 | 0 | 4 | 0 | |
+ 25 | 0 | 5 | 0 | |
+ 25 | 0 | 6 | 0 | |
+ 25 | 0 | 7 | 0 | |
+ 25 | 0 | 8 | 0 | |
+ 25 | 0 | 9 | 0 | |
+ 25 | 0 | 10 | 0 | |
+ 25 | 0 | 11 | 0 | |
+ 25 | 0 | 12 | 0 | |
+ 25 | 0 | 13 | 0 | |
+ 26 | 0 | 1 | 0 | |
+ 26 | 0 | 2 | 0 | |
+ 26 | 0 | 3 | 0 | |
+ 26 | 0 | 4 | 0 | |
+ 26 | 0 | 5 | 0 | |
+ 26 | 0 | 6 | 0 | |
+ 26 | 0 | 7 | 0 | |
+ 26 | 0 | 8 | 0 | |
+ 26 | 0 | 9 | 0 | |
+ 26 | 0 | 10 | 0 | |
+ 26 | 0 | 11 | 0 | |
+ 26 | 0 | 12 | 0 | |
+ 26 | 0 | 13 | 0 | |
+ 27 | 0 | 1 | 0 | |
+ 27 | 0 | 2 | 0 | |
+ 27 | 0 | 3 | 0 | |
+ 27 | 0 | 4 | 0 | |
+ 27 | 0 | 5 | 0 | |
+ 27 | 0 | 6 | 0 | |
+ 27 | 0 | 7 | 0 | |
+ 27 | 0 | 8 | 0 | |
+ 27 | 0 | 9 | 0 | |
+ 27 | 0 | 10 | 0 | |
+ 27 | 0 | 11 | 0 | |
+ 27 | 0 | 12 | 0 | |
+ 27 | 0 | 13 | 0 | |
+ | | 14 | 0 | unmatched outer |
+(352 rows)
+
+set local min_parallel_table_scan_size = 0;
+set local parallel_setup_cost = 0;
+set local enable_hashjoin = on;
+savepoint settings;
+set max_parallel_workers_per_gather = 1;
+set enable_parallel_hash = on;
+set work_mem = '64kB';
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside_batch0
+LEFT OUTER JOIN hashside_wide_batch0 USING (a);
+ QUERY PLAN
+--------------------------------------------------------------------------------------
+ Gather (actual rows=469 loops=1)
+ Workers Planned: 1
+ Workers Launched: 1
+ -> Parallel Hash Left Join (actual rows=234 loops=2)
+ Hash Cond: (probeside_batch0.a = hashside_wide_batch0.a)
+ -> Parallel Seq Scan on probeside_batch0 (actual rows=14 loops=1)
+ -> Parallel Hash (actual rows=18 loops=2)
+ Buckets: 8 (originally 8) Batches: 16 (originally 8)
+ Batch: 0 Stripes: 5
+ -> Parallel Seq Scan on hashside_wide_batch0 (actual rows=36 loops=1)
+(10 rows)
+
+SELECT
+ hashside_wide_batch0.id as hashside_id,
+ (hashside_wide_batch0.a).hash as hashside_hash,
+ probeside_batch0.id as probeside_id,
+ (probeside_batch0.a).hash as probeside_hash,
+ TRIM((probeside_batch0.a).value) as probeside_trimmed_value,
+ TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value
+FROM probeside_batch0
+LEFT OUTER JOIN hashside_wide_batch0 USING (a)
+ORDER BY 1, 2, 3, 4, 5, 6;
+ hashside_id | hashside_hash | probeside_id | probeside_hash | probeside_trimmed_value | hashside_trimmed_value
+-------------+---------------+--------------+----------------+-------------------------+------------------------
+ 1 | 0 | 1 | 0 | |
+ 1 | 0 | 2 | 0 | |
+ 1 | 0 | 3 | 0 | |
+ 1 | 0 | 4 | 0 | |
+ 1 | 0 | 5 | 0 | |
+ 1 | 0 | 6 | 0 | |
+ 1 | 0 | 7 | 0 | |
+ 1 | 0 | 8 | 0 | |
+ 1 | 0 | 9 | 0 | |
+ 1 | 0 | 10 | 0 | |
+ 1 | 0 | 11 | 0 | |
+ 1 | 0 | 12 | 0 | |
+ 1 | 0 | 13 | 0 | |
+ 2 | 0 | 1 | 0 | |
+ 2 | 0 | 2 | 0 | |
+ 2 | 0 | 3 | 0 | |
+ 2 | 0 | 4 | 0 | |
+ 2 | 0 | 5 | 0 | |
+ 2 | 0 | 6 | 0 | |
+ 2 | 0 | 7 | 0 | |
+ 2 | 0 | 8 | 0 | |
+ 2 | 0 | 9 | 0 | |
+ 2 | 0 | 10 | 0 | |
+ 2 | 0 | 11 | 0 | |
+ 2 | 0 | 12 | 0 | |
+ 2 | 0 | 13 | 0 | |
+ 3 | 0 | 1 | 0 | |
+ 3 | 0 | 2 | 0 | |
+ 3 | 0 | 3 | 0 | |
+ 3 | 0 | 4 | 0 | |
+ 3 | 0 | 5 | 0 | |
+ 3 | 0 | 6 | 0 | |
+ 3 | 0 | 7 | 0 | |
+ 3 | 0 | 8 | 0 | |
+ 3 | 0 | 9 | 0 | |
+ 3 | 0 | 10 | 0 | |
+ 3 | 0 | 11 | 0 | |
+ 3 | 0 | 12 | 0 | |
+ 3 | 0 | 13 | 0 | |
+ 4 | 0 | 1 | 0 | |
+ 4 | 0 | 2 | 0 | |
+ 4 | 0 | 3 | 0 | |
+ 4 | 0 | 4 | 0 | |
+ 4 | 0 | 5 | 0 | |
+ 4 | 0 | 6 | 0 | |
+ 4 | 0 | 7 | 0 | |
+ 4 | 0 | 8 | 0 | |
+ 4 | 0 | 9 | 0 | |
+ 4 | 0 | 10 | 0 | |
+ 4 | 0 | 11 | 0 | |
+ 4 | 0 | 12 | 0 | |
+ 4 | 0 | 13 | 0 | |
+ 5 | 0 | 1 | 0 | |
+ 5 | 0 | 2 | 0 | |
+ 5 | 0 | 3 | 0 | |
+ 5 | 0 | 4 | 0 | |
+ 5 | 0 | 5 | 0 | |
+ 5 | 0 | 6 | 0 | |
+ 5 | 0 | 7 | 0 | |
+ 5 | 0 | 8 | 0 | |
+ 5 | 0 | 9 | 0 | |
+ 5 | 0 | 10 | 0 | |
+ 5 | 0 | 11 | 0 | |
+ 5 | 0 | 12 | 0 | |
+ 5 | 0 | 13 | 0 | |
+ 6 | 0 | 1 | 0 | |
+ 6 | 0 | 2 | 0 | |
+ 6 | 0 | 3 | 0 | |
+ 6 | 0 | 4 | 0 | |
+ 6 | 0 | 5 | 0 | |
+ 6 | 0 | 6 | 0 | |
+ 6 | 0 | 7 | 0 | |
+ 6 | 0 | 8 | 0 | |
+ 6 | 0 | 9 | 0 | |
+ 6 | 0 | 10 | 0 | |
+ 6 | 0 | 11 | 0 | |
+ 6 | 0 | 12 | 0 | |
+ 6 | 0 | 13 | 0 | |
+ 7 | 0 | 1 | 0 | |
+ 7 | 0 | 2 | 0 | |
+ 7 | 0 | 3 | 0 | |
+ 7 | 0 | 4 | 0 | |
+ 7 | 0 | 5 | 0 | |
+ 7 | 0 | 6 | 0 | |
+ 7 | 0 | 7 | 0 | |
+ 7 | 0 | 8 | 0 | |
+ 7 | 0 | 9 | 0 | |
+ 7 | 0 | 10 | 0 | |
+ 7 | 0 | 11 | 0 | |
+ 7 | 0 | 12 | 0 | |
+ 7 | 0 | 13 | 0 | |
+ 8 | 0 | 1 | 0 | |
+ 8 | 0 | 2 | 0 | |
+ 8 | 0 | 3 | 0 | |
+ 8 | 0 | 4 | 0 | |
+ 8 | 0 | 5 | 0 | |
+ 8 | 0 | 6 | 0 | |
+ 8 | 0 | 7 | 0 | |
+ 8 | 0 | 8 | 0 | |
+ 8 | 0 | 9 | 0 | |
+ 8 | 0 | 10 | 0 | |
+ 8 | 0 | 11 | 0 | |
+ 8 | 0 | 12 | 0 | |
+ 8 | 0 | 13 | 0 | |
+ 9 | 0 | 1 | 0 | |
+ 9 | 0 | 2 | 0 | |
+ 9 | 0 | 3 | 0 | |
+ 9 | 0 | 4 | 0 | |
+ 9 | 0 | 5 | 0 | |
+ 9 | 0 | 6 | 0 | |
+ 9 | 0 | 7 | 0 | |
+ 9 | 0 | 8 | 0 | |
+ 9 | 0 | 9 | 0 | |
+ 9 | 0 | 10 | 0 | |
+ 9 | 0 | 11 | 0 | |
+ 9 | 0 | 12 | 0 | |
+ 9 | 0 | 13 | 0 | |
+ 10 | 0 | 1 | 0 | |
+ 10 | 0 | 2 | 0 | |
+ 10 | 0 | 3 | 0 | |
+ 10 | 0 | 4 | 0 | |
+ 10 | 0 | 5 | 0 | |
+ 10 | 0 | 6 | 0 | |
+ 10 | 0 | 7 | 0 | |
+ 10 | 0 | 8 | 0 | |
+ 10 | 0 | 9 | 0 | |
+ 10 | 0 | 10 | 0 | |
+ 10 | 0 | 11 | 0 | |
+ 10 | 0 | 12 | 0 | |
+ 10 | 0 | 13 | 0 | |
+ 11 | 0 | 1 | 0 | |
+ 11 | 0 | 2 | 0 | |
+ 11 | 0 | 3 | 0 | |
+ 11 | 0 | 4 | 0 | |
+ 11 | 0 | 5 | 0 | |
+ 11 | 0 | 6 | 0 | |
+ 11 | 0 | 7 | 0 | |
+ 11 | 0 | 8 | 0 | |
+ 11 | 0 | 9 | 0 | |
+ 11 | 0 | 10 | 0 | |
+ 11 | 0 | 11 | 0 | |
+ 11 | 0 | 12 | 0 | |
+ 11 | 0 | 13 | 0 | |
+ 12 | 0 | 1 | 0 | |
+ 12 | 0 | 2 | 0 | |
+ 12 | 0 | 3 | 0 | |
+ 12 | 0 | 4 | 0 | |
+ 12 | 0 | 5 | 0 | |
+ 12 | 0 | 6 | 0 | |
+ 12 | 0 | 7 | 0 | |
+ 12 | 0 | 8 | 0 | |
+ 12 | 0 | 9 | 0 | |
+ 12 | 0 | 10 | 0 | |
+ 12 | 0 | 11 | 0 | |
+ 12 | 0 | 12 | 0 | |
+ 12 | 0 | 13 | 0 | |
+ 13 | 0 | 1 | 0 | |
+ 13 | 0 | 2 | 0 | |
+ 13 | 0 | 3 | 0 | |
+ 13 | 0 | 4 | 0 | |
+ 13 | 0 | 5 | 0 | |
+ 13 | 0 | 6 | 0 | |
+ 13 | 0 | 7 | 0 | |
+ 13 | 0 | 8 | 0 | |
+ 13 | 0 | 9 | 0 | |
+ 13 | 0 | 10 | 0 | |
+ 13 | 0 | 11 | 0 | |
+ 13 | 0 | 12 | 0 | |
+ 13 | 0 | 13 | 0 | |
+ 14 | 0 | 1 | 0 | |
+ 14 | 0 | 2 | 0 | |
+ 14 | 0 | 3 | 0 | |
+ 14 | 0 | 4 | 0 | |
+ 14 | 0 | 5 | 0 | |
+ 14 | 0 | 6 | 0 | |
+ 14 | 0 | 7 | 0 | |
+ 14 | 0 | 8 | 0 | |
+ 14 | 0 | 9 | 0 | |
+ 14 | 0 | 10 | 0 | |
+ 14 | 0 | 11 | 0 | |
+ 14 | 0 | 12 | 0 | |
+ 14 | 0 | 13 | 0 | |
+ 15 | 0 | 1 | 0 | |
+ 15 | 0 | 2 | 0 | |
+ 15 | 0 | 3 | 0 | |
+ 15 | 0 | 4 | 0 | |
+ 15 | 0 | 5 | 0 | |
+ 15 | 0 | 6 | 0 | |
+ 15 | 0 | 7 | 0 | |
+ 15 | 0 | 8 | 0 | |
+ 15 | 0 | 9 | 0 | |
+ 15 | 0 | 10 | 0 | |
+ 15 | 0 | 11 | 0 | |
+ 15 | 0 | 12 | 0 | |
+ 15 | 0 | 13 | 0 | |
+ 16 | 0 | 1 | 0 | |
+ 16 | 0 | 2 | 0 | |
+ 16 | 0 | 3 | 0 | |
+ 16 | 0 | 4 | 0 | |
+ 16 | 0 | 5 | 0 | |
+ 16 | 0 | 6 | 0 | |
+ 16 | 0 | 7 | 0 | |
+ 16 | 0 | 8 | 0 | |
+ 16 | 0 | 9 | 0 | |
+ 16 | 0 | 10 | 0 | |
+ 16 | 0 | 11 | 0 | |
+ 16 | 0 | 12 | 0 | |
+ 16 | 0 | 13 | 0 | |
+ 17 | 0 | 1 | 0 | |
+ 17 | 0 | 2 | 0 | |
+ 17 | 0 | 3 | 0 | |
+ 17 | 0 | 4 | 0 | |
+ 17 | 0 | 5 | 0 | |
+ 17 | 0 | 6 | 0 | |
+ 17 | 0 | 7 | 0 | |
+ 17 | 0 | 8 | 0 | |
+ 17 | 0 | 9 | 0 | |
+ 17 | 0 | 10 | 0 | |
+ 17 | 0 | 11 | 0 | |
+ 17 | 0 | 12 | 0 | |
+ 17 | 0 | 13 | 0 | |
+ 18 | 0 | 1 | 0 | |
+ 18 | 0 | 2 | 0 | |
+ 18 | 0 | 3 | 0 | |
+ 18 | 0 | 4 | 0 | |
+ 18 | 0 | 5 | 0 | |
+ 18 | 0 | 6 | 0 | |
+ 18 | 0 | 7 | 0 | |
+ 18 | 0 | 8 | 0 | |
+ 18 | 0 | 9 | 0 | |
+ 18 | 0 | 10 | 0 | |
+ 18 | 0 | 11 | 0 | |
+ 18 | 0 | 12 | 0 | |
+ 18 | 0 | 13 | 0 | |
+ 19 | 0 | 1 | 0 | |
+ 19 | 0 | 2 | 0 | |
+ 19 | 0 | 3 | 0 | |
+ 19 | 0 | 4 | 0 | |
+ 19 | 0 | 5 | 0 | |
+ 19 | 0 | 6 | 0 | |
+ 19 | 0 | 7 | 0 | |
+ 19 | 0 | 8 | 0 | |
+ 19 | 0 | 9 | 0 | |
+ 19 | 0 | 10 | 0 | |
+ 19 | 0 | 11 | 0 | |
+ 19 | 0 | 12 | 0 | |
+ 19 | 0 | 13 | 0 | |
+ 20 | 0 | 1 | 0 | |
+ 20 | 0 | 2 | 0 | |
+ 20 | 0 | 3 | 0 | |
+ 20 | 0 | 4 | 0 | |
+ 20 | 0 | 5 | 0 | |
+ 20 | 0 | 6 | 0 | |
+ 20 | 0 | 7 | 0 | |
+ 20 | 0 | 8 | 0 | |
+ 20 | 0 | 9 | 0 | |
+ 20 | 0 | 10 | 0 | |
+ 20 | 0 | 11 | 0 | |
+ 20 | 0 | 12 | 0 | |
+ 20 | 0 | 13 | 0 | |
+ 21 | 0 | 1 | 0 | |
+ 21 | 0 | 2 | 0 | |
+ 21 | 0 | 3 | 0 | |
+ 21 | 0 | 4 | 0 | |
+ 21 | 0 | 5 | 0 | |
+ 21 | 0 | 6 | 0 | |
+ 21 | 0 | 7 | 0 | |
+ 21 | 0 | 8 | 0 | |
+ 21 | 0 | 9 | 0 | |
+ 21 | 0 | 10 | 0 | |
+ 21 | 0 | 11 | 0 | |
+ 21 | 0 | 12 | 0 | |
+ 21 | 0 | 13 | 0 | |
+ 22 | 0 | 1 | 0 | |
+ 22 | 0 | 2 | 0 | |
+ 22 | 0 | 3 | 0 | |
+ 22 | 0 | 4 | 0 | |
+ 22 | 0 | 5 | 0 | |
+ 22 | 0 | 6 | 0 | |
+ 22 | 0 | 7 | 0 | |
+ 22 | 0 | 8 | 0 | |
+ 22 | 0 | 9 | 0 | |
+ 22 | 0 | 10 | 0 | |
+ 22 | 0 | 11 | 0 | |
+ 22 | 0 | 12 | 0 | |
+ 22 | 0 | 13 | 0 | |
+ 23 | 0 | 1 | 0 | |
+ 23 | 0 | 2 | 0 | |
+ 23 | 0 | 3 | 0 | |
+ 23 | 0 | 4 | 0 | |
+ 23 | 0 | 5 | 0 | |
+ 23 | 0 | 6 | 0 | |
+ 23 | 0 | 7 | 0 | |
+ 23 | 0 | 8 | 0 | |
+ 23 | 0 | 9 | 0 | |
+ 23 | 0 | 10 | 0 | |
+ 23 | 0 | 11 | 0 | |
+ 23 | 0 | 12 | 0 | |
+ 23 | 0 | 13 | 0 | |
+ 24 | 0 | 1 | 0 | |
+ 24 | 0 | 2 | 0 | |
+ 24 | 0 | 3 | 0 | |
+ 24 | 0 | 4 | 0 | |
+ 24 | 0 | 5 | 0 | |
+ 24 | 0 | 6 | 0 | |
+ 24 | 0 | 7 | 0 | |
+ 24 | 0 | 8 | 0 | |
+ 24 | 0 | 9 | 0 | |
+ 24 | 0 | 10 | 0 | |
+ 24 | 0 | 11 | 0 | |
+ 24 | 0 | 12 | 0 | |
+ 24 | 0 | 13 | 0 | |
+ 25 | 0 | 1 | 0 | |
+ 25 | 0 | 2 | 0 | |
+ 25 | 0 | 3 | 0 | |
+ 25 | 0 | 4 | 0 | |
+ 25 | 0 | 5 | 0 | |
+ 25 | 0 | 6 | 0 | |
+ 25 | 0 | 7 | 0 | |
+ 25 | 0 | 8 | 0 | |
+ 25 | 0 | 9 | 0 | |
+ 25 | 0 | 10 | 0 | |
+ 25 | 0 | 11 | 0 | |
+ 25 | 0 | 12 | 0 | |
+ 25 | 0 | 13 | 0 | |
+ 26 | 0 | 1 | 0 | |
+ 26 | 0 | 2 | 0 | |
+ 26 | 0 | 3 | 0 | |
+ 26 | 0 | 4 | 0 | |
+ 26 | 0 | 5 | 0 | |
+ 26 | 0 | 6 | 0 | |
+ 26 | 0 | 7 | 0 | |
+ 26 | 0 | 8 | 0 | |
+ 26 | 0 | 9 | 0 | |
+ 26 | 0 | 10 | 0 | |
+ 26 | 0 | 11 | 0 | |
+ 26 | 0 | 12 | 0 | |
+ 26 | 0 | 13 | 0 | |
+ 27 | 0 | 1 | 0 | |
+ 27 | 0 | 2 | 0 | |
+ 27 | 0 | 3 | 0 | |
+ 27 | 0 | 4 | 0 | |
+ 27 | 0 | 5 | 0 | |
+ 27 | 0 | 6 | 0 | |
+ 27 | 0 | 7 | 0 | |
+ 27 | 0 | 8 | 0 | |
+ 27 | 0 | 9 | 0 | |
+ 27 | 0 | 10 | 0 | |
+ 27 | 0 | 11 | 0 | |
+ 27 | 0 | 12 | 0 | |
+ 27 | 0 | 13 | 0 | |
+ 28 | 0 | 1 | 0 | |
+ 28 | 0 | 2 | 0 | |
+ 28 | 0 | 3 | 0 | |
+ 28 | 0 | 4 | 0 | |
+ 28 | 0 | 5 | 0 | |
+ 28 | 0 | 6 | 0 | |
+ 28 | 0 | 7 | 0 | |
+ 28 | 0 | 8 | 0 | |
+ 28 | 0 | 9 | 0 | |
+ 28 | 0 | 10 | 0 | |
+ 28 | 0 | 11 | 0 | |
+ 28 | 0 | 12 | 0 | |
+ 28 | 0 | 13 | 0 | |
+ 29 | 0 | 1 | 0 | |
+ 29 | 0 | 2 | 0 | |
+ 29 | 0 | 3 | 0 | |
+ 29 | 0 | 4 | 0 | |
+ 29 | 0 | 5 | 0 | |
+ 29 | 0 | 6 | 0 | |
+ 29 | 0 | 7 | 0 | |
+ 29 | 0 | 8 | 0 | |
+ 29 | 0 | 9 | 0 | |
+ 29 | 0 | 10 | 0 | |
+ 29 | 0 | 11 | 0 | |
+ 29 | 0 | 12 | 0 | |
+ 29 | 0 | 13 | 0 | |
+ 30 | 0 | 1 | 0 | |
+ 30 | 0 | 2 | 0 | |
+ 30 | 0 | 3 | 0 | |
+ 30 | 0 | 4 | 0 | |
+ 30 | 0 | 5 | 0 | |
+ 30 | 0 | 6 | 0 | |
+ 30 | 0 | 7 | 0 | |
+ 30 | 0 | 8 | 0 | |
+ 30 | 0 | 9 | 0 | |
+ 30 | 0 | 10 | 0 | |
+ 30 | 0 | 11 | 0 | |
+ 30 | 0 | 12 | 0 | |
+ 30 | 0 | 13 | 0 | |
+ 31 | 0 | 1 | 0 | |
+ 31 | 0 | 2 | 0 | |
+ 31 | 0 | 3 | 0 | |
+ 31 | 0 | 4 | 0 | |
+ 31 | 0 | 5 | 0 | |
+ 31 | 0 | 6 | 0 | |
+ 31 | 0 | 7 | 0 | |
+ 31 | 0 | 8 | 0 | |
+ 31 | 0 | 9 | 0 | |
+ 31 | 0 | 10 | 0 | |
+ 31 | 0 | 11 | 0 | |
+ 31 | 0 | 12 | 0 | |
+ 31 | 0 | 13 | 0 | |
+ 32 | 0 | 1 | 0 | |
+ 32 | 0 | 2 | 0 | |
+ 32 | 0 | 3 | 0 | |
+ 32 | 0 | 4 | 0 | |
+ 32 | 0 | 5 | 0 | |
+ 32 | 0 | 6 | 0 | |
+ 32 | 0 | 7 | 0 | |
+ 32 | 0 | 8 | 0 | |
+ 32 | 0 | 9 | 0 | |
+ 32 | 0 | 10 | 0 | |
+ 32 | 0 | 11 | 0 | |
+ 32 | 0 | 12 | 0 | |
+ 32 | 0 | 13 | 0 | |
+ 33 | 0 | 1 | 0 | |
+ 33 | 0 | 2 | 0 | |
+ 33 | 0 | 3 | 0 | |
+ 33 | 0 | 4 | 0 | |
+ 33 | 0 | 5 | 0 | |
+ 33 | 0 | 6 | 0 | |
+ 33 | 0 | 7 | 0 | |
+ 33 | 0 | 8 | 0 | |
+ 33 | 0 | 9 | 0 | |
+ 33 | 0 | 10 | 0 | |
+ 33 | 0 | 11 | 0 | |
+ 33 | 0 | 12 | 0 | |
+ 33 | 0 | 13 | 0 | |
+ 34 | 0 | 1 | 0 | |
+ 34 | 0 | 2 | 0 | |
+ 34 | 0 | 3 | 0 | |
+ 34 | 0 | 4 | 0 | |
+ 34 | 0 | 5 | 0 | |
+ 34 | 0 | 6 | 0 | |
+ 34 | 0 | 7 | 0 | |
+ 34 | 0 | 8 | 0 | |
+ 34 | 0 | 9 | 0 | |
+ 34 | 0 | 10 | 0 | |
+ 34 | 0 | 11 | 0 | |
+ 34 | 0 | 12 | 0 | |
+ 34 | 0 | 13 | 0 | |
+ 35 | 0 | 1 | 0 | |
+ 35 | 0 | 2 | 0 | |
+ 35 | 0 | 3 | 0 | |
+ 35 | 0 | 4 | 0 | |
+ 35 | 0 | 5 | 0 | |
+ 35 | 0 | 6 | 0 | |
+ 35 | 0 | 7 | 0 | |
+ 35 | 0 | 8 | 0 | |
+ 35 | 0 | 9 | 0 | |
+ 35 | 0 | 10 | 0 | |
+ 35 | 0 | 11 | 0 | |
+ 35 | 0 | 12 | 0 | |
+ 35 | 0 | 13 | 0 | |
+ 36 | 0 | 1 | 0 | |
+ 36 | 0 | 2 | 0 | |
+ 36 | 0 | 3 | 0 | |
+ 36 | 0 | 4 | 0 | |
+ 36 | 0 | 5 | 0 | |
+ 36 | 0 | 6 | 0 | |
+ 36 | 0 | 7 | 0 | |
+ 36 | 0 | 8 | 0 | |
+ 36 | 0 | 9 | 0 | |
+ 36 | 0 | 10 | 0 | |
+ 36 | 0 | 11 | 0 | |
+ 36 | 0 | 12 | 0 | |
+ 36 | 0 | 13 | 0 | |
+ | | 14 | 0 | unmatched outer |
+(469 rows)
+
+rollback to settings;
+rollback;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 601734a6f1ec1..2a18dc423e2bf 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1324,6 +1324,16 @@ pg_available_extensions| SELECT e.name,
e.comment
FROM (pg_available_extensions() e(name, default_version, comment)
LEFT JOIN pg_extension x ON ((e.name = x.extname)));
+pg_backend_memory_contexts| SELECT pg_get_backend_memory_contexts.name,
+ pg_get_backend_memory_contexts.ident,
+ pg_get_backend_memory_contexts.parent,
+ pg_get_backend_memory_contexts.level,
+ pg_get_backend_memory_contexts.total_bytes,
+ pg_get_backend_memory_contexts.total_nblocks,
+ pg_get_backend_memory_contexts.free_bytes,
+ pg_get_backend_memory_contexts.free_chunks,
+ pg_get_backend_memory_contexts.used_bytes
+ FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, parent, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes);
pg_config| SELECT pg_config.name,
pg_config.setting
FROM pg_config() pg_config(name, setting);
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index 1c5d80da323ea..b81923f2e7410 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -757,6 +757,7 @@ insert into outer_text values ('a', null);
insert into outer_text values ('b', null);
create temp table inner_text (c1 text, c2 text);
insert into inner_text values ('a', null);
+insert into inner_text values ('123', '456');
select * from outer_text where (f1, f2) not in (select * from inner_text);
f1 | f2
----+----
@@ -797,6 +798,82 @@ select '1'::text in (select '1'::name union all select '1'::name);
t
(1 row)
+--
+-- Test that we don't try to use a hashed subplan if the simplified
+-- testexpr isn't of the right shape
+--
+-- this fails by default, of course
+select * from int8_tbl where q1 in (select c1 from inner_text);
+ERROR: operator does not exist: bigint = text
+LINE 1: select * from int8_tbl where q1 in (select c1 from inner_tex...
+ ^
+HINT: No operator matches the given name and argument types. You might need to add explicit type casts.
+begin;
+-- make an operator to allow it to succeed
+create function bogus_int8_text_eq(int8, text) returns boolean
+language sql as 'select $1::text = $2';
+create operator = (procedure=bogus_int8_text_eq, leftarg=int8, rightarg=text);
+explain (costs off)
+select * from int8_tbl where q1 in (select c1 from inner_text);
+ QUERY PLAN
+--------------------------------
+ Seq Scan on int8_tbl
+ Filter: (hashed SubPlan 1)
+ SubPlan 1
+ -> Seq Scan on inner_text
+(4 rows)
+
+select * from int8_tbl where q1 in (select c1 from inner_text);
+ q1 | q2
+-----+------------------
+ 123 | 456
+ 123 | 4567890123456789
+(2 rows)
+
+-- inlining of this function results in unusual number of hash clauses,
+-- which we can still cope with
+create or replace function bogus_int8_text_eq(int8, text) returns boolean
+language sql as 'select $1::text = $2 and $1::text = $2';
+explain (costs off)
+select * from int8_tbl where q1 in (select c1 from inner_text);
+ QUERY PLAN
+--------------------------------
+ Seq Scan on int8_tbl
+ Filter: (hashed SubPlan 1)
+ SubPlan 1
+ -> Seq Scan on inner_text
+(4 rows)
+
+select * from int8_tbl where q1 in (select c1 from inner_text);
+ q1 | q2
+-----+------------------
+ 123 | 456
+ 123 | 4567890123456789
+(2 rows)
+
+-- inlining of this function causes LHS and RHS to be switched,
+-- which we can't cope with, so hashing should be abandoned
+create or replace function bogus_int8_text_eq(int8, text) returns boolean
+language sql as 'select $2 = $1::text';
+explain (costs off)
+select * from int8_tbl where q1 in (select c1 from inner_text);
+ QUERY PLAN
+--------------------------------------
+ Seq Scan on int8_tbl
+ Filter: (SubPlan 1)
+ SubPlan 1
+ -> Materialize
+ -> Seq Scan on inner_text
+(5 rows)
+
+select * from int8_tbl where q1 in (select c1 from inner_text);
+ q1 | q2
+-----+------------------
+ 123 | 456
+ 123 | 4567890123456789
+(2 rows)
+
+rollback; -- to get rid of the bogus operator
--
-- Test case for planner bug with nested EXISTS handling
--
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 06c4c3e476378..1cffc3349d602 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -19,6 +19,15 @@ select count(*) >= 0 as ok from pg_available_extensions;
t
(1 row)
+-- The entire output of pg_backend_memory_contexts is not stable,
+-- we test only the existance and basic condition of TopMemoryContext.
+select name, ident, parent, level, total_bytes >= free_bytes
+ from pg_backend_memory_contexts where level = 0;
+ name | ident | parent | level | ?column?
+------------------+-------+--------+-------+----------
+ TopMemoryContext | | | 0 | t
+(1 row)
+
-- At introduction, pg_config had 23 entries; it may grow
select count(*) > 20 as ok from pg_config;
ok
diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
index ce6401d80d288..4cc55d852513e 100644
--- a/src/test/regress/sql/alter_table.sql
+++ b/src/test/regress/sql/alter_table.sql
@@ -2252,6 +2252,20 @@ ALTER TABLE ataddindex
\d ataddindex
DROP TABLE ataddindex;
+CREATE TABLE ataddindex(id int, ref_id int);
+ALTER TABLE ataddindex
+ ADD PRIMARY KEY (id),
+ ADD FOREIGN KEY (ref_id) REFERENCES ataddindex;
+\d ataddindex
+DROP TABLE ataddindex;
+
+CREATE TABLE ataddindex(id int, ref_id int);
+ALTER TABLE ataddindex
+ ADD UNIQUE (id),
+ ADD FOREIGN KEY (ref_id) REFERENCES ataddindex (id);
+\d ataddindex
+DROP TABLE ataddindex;
+
-- unsupported constraint types for partitioned tables
CREATE TABLE partitioned (
a int,
diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql
index 1289e76ecb9b5..d1a82474f3f18 100644
--- a/src/test/regress/sql/brin.sql
+++ b/src/test/regress/sql/brin.sql
@@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea,
int4rangecol int4range,
lsncol pg_lsn,
boxcol box
-) WITH (fillfactor=10);
+) WITH (fillfactor=10, autovacuum_enabled=off);
INSERT INTO brintest SELECT
repeat(stringu1, 8)::bytea,
diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql
index 6981ac0cbeeed..f0a8a56b76fad 100644
--- a/src/test/regress/sql/create_table_like.sql
+++ b/src/test/regress/sql/create_table_like.sql
@@ -66,7 +66,9 @@ SELECT * FROM test_like_gen_3;
DROP TABLE test_like_gen_1, test_like_gen_2, test_like_gen_3;
-- also test generated column with a "forward" reference (bug #16342)
-CREATE TABLE test_like_4 (b int DEFAULT 42, c int GENERATED ALWAYS AS (a * 2) STORED, a int);
+CREATE TABLE test_like_4 (b int DEFAULT 42,
+ c int GENERATED ALWAYS AS (a * 2) STORED,
+ a int CHECK (a > 0));
\d test_like_4
CREATE TABLE test_like_4a (LIKE test_like_4);
CREATE TABLE test_like_4b (LIKE test_like_4 INCLUDING DEFAULTS);
@@ -84,7 +86,17 @@ SELECT a, b, c FROM test_like_4c;
\d test_like_4d
INSERT INTO test_like_4d (a) VALUES(11);
SELECT a, b, c FROM test_like_4d;
+
+-- Test renumbering of Vars when combining LIKE with inheritance
+CREATE TABLE test_like_5 (x point, y point, z point);
+CREATE TABLE test_like_5x (p int CHECK (p > 0),
+ q int GENERATED ALWAYS AS (p * 2) STORED);
+CREATE TABLE test_like_5c (LIKE test_like_4 INCLUDING ALL)
+ INHERITS (test_like_5, test_like_5x);
+\d test_like_5c
+
DROP TABLE test_like_4, test_like_4a, test_like_4b, test_like_4c, test_like_4d;
+DROP TABLE test_like_5, test_like_5x, test_like_5c;
CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
INSERT INTO inhg VALUES (5, 10);
@@ -119,9 +131,10 @@ CREATE TABLE ctlt2 (c text);
ALTER TABLE ctlt2 ALTER COLUMN c SET STORAGE EXTERNAL;
COMMENT ON COLUMN ctlt2.c IS 'C';
-CREATE TABLE ctlt3 (a text CHECK (length(a) < 5), c text);
+CREATE TABLE ctlt3 (a text CHECK (length(a) < 5), c text CHECK (length(c) < 7));
ALTER TABLE ctlt3 ALTER COLUMN c SET STORAGE EXTERNAL;
ALTER TABLE ctlt3 ALTER COLUMN a SET STORAGE MAIN;
+CREATE INDEX ctlt3_fnidx ON ctlt3 ((a || c));
COMMENT ON COLUMN ctlt3.a IS 'A3';
COMMENT ON COLUMN ctlt3.c IS 'C';
COMMENT ON CONSTRAINT ctlt3_a_check ON ctlt3 IS 't3_a_check';
@@ -138,7 +151,7 @@ CREATE TABLE ctlt1_inh (LIKE ctlt1 INCLUDING CONSTRAINTS INCLUDING COMMENTS) INH
SELECT description FROM pg_description, pg_constraint c WHERE classoid = 'pg_constraint'::regclass AND objoid = c.oid AND c.conrelid = 'ctlt1_inh'::regclass;
CREATE TABLE ctlt13_inh () INHERITS (ctlt1, ctlt3);
\d+ ctlt13_inh
-CREATE TABLE ctlt13_like (LIKE ctlt3 INCLUDING CONSTRAINTS INCLUDING COMMENTS INCLUDING STORAGE) INHERITS (ctlt1);
+CREATE TABLE ctlt13_like (LIKE ctlt3 INCLUDING CONSTRAINTS INCLUDING INDEXES INCLUDING COMMENTS INCLUDING STORAGE) INHERITS (ctlt1);
\d+ ctlt13_like
SELECT description FROM pg_description, pg_constraint c WHERE classoid = 'pg_constraint'::regclass AND objoid = c.oid AND c.conrelid = 'ctlt13_like'::regclass;
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index dce2a34207246..c79116c927b15 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -25,6 +25,9 @@ begin
-- Ignore text-mode buffers output because it varies depending
-- on the system state
CONTINUE WHEN (ln ~ ' +Buffers: .*');
+ -- Ignore text-mode "Planning:" line because whether it's output
+ -- varies depending on the system state
+ CONTINUE WHEN (ln = 'Planning:');
return next ln;
end loop;
end;
@@ -57,6 +60,8 @@ select explain_filter('explain (analyze, buffers, format text) select * from int
select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
select explain_filter('explain (analyze, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
+select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
-- SETTINGS option
-- We have to ignore other settings that might be imposed by the environment,
diff --git a/src/test/regress/sql/gin.sql b/src/test/regress/sql/gin.sql
index abe35752652ab..efb8ef3e964cd 100644
--- a/src/test/regress/sql/gin.sql
+++ b/src/test/regress/sql/gin.sql
@@ -138,4 +138,28 @@ from
reset enable_seqscan;
reset enable_bitmapscan;
+-- re-purpose t_gin_test_tbl to test scans involving posting trees
+insert into t_gin_test_tbl select array[1, g, g/10], array[2, g, g/10]
+ from generate_series(1, 20000) g;
+
+select gin_clean_pending_list('t_gin_test_tbl_i_j_idx') is not null;
+
+analyze t_gin_test_tbl;
+
+set enable_seqscan = off;
+set enable_bitmapscan = on;
+
+explain (costs off)
+select count(*) from t_gin_test_tbl where j @> array[50];
+select count(*) from t_gin_test_tbl where j @> array[50];
+explain (costs off)
+select count(*) from t_gin_test_tbl where j @> array[2];
+select count(*) from t_gin_test_tbl where j @> array[2];
+explain (costs off)
+select count(*) from t_gin_test_tbl where j @> '{}'::int[];
+select count(*) from t_gin_test_tbl where j @> '{}'::int[];
+
+reset enable_seqscan;
+reset enable_bitmapscan;
+
drop table t_gin_test_tbl;
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index e6c28743a4411..d4e5628eba8d7 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -172,6 +172,22 @@ select x, not x as not_x, q2 from
group by grouping sets(x, q2)
order by x, q2;
+-- check qual push-down rules for a subquery with grouping sets
+explain (verbose, costs off)
+select * from (
+ select 1 as x, q1, sum(q2)
+ from int8_tbl i1
+ group by grouping sets(1, 2)
+) ss
+where x = 1 and q1 = 123;
+
+select * from (
+ select 1 as x, q1, sum(q2)
+ from int8_tbl i1
+ group by grouping sets(1, 2)
+) ss
+where x = 1 and q1 = 123;
+
-- simple rescan tests
select a, b, sum(v.x)
diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql
index 68c1a8c7b65e4..d9f8a115d85dd 100644
--- a/src/test/regress/sql/join_hash.sql
+++ b/src/test/regress/sql/join_hash.sql
@@ -450,22 +450,26 @@ rollback to settings;
-- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and
-- sts_puttuple oversized tuple cases because it's multi-batch)
-savepoint settings;
-set max_parallel_workers_per_gather = 2;
-set enable_parallel_hash = on;
-set work_mem = '128kB';
-explain (costs off)
- select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
-select length(max(s.t))
-from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
-select final > 1 as multibatch
- from hash_join_batches(
-$$
- select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
-$$);
-rollback to settings;
+-- savepoint settings;
+-- set max_parallel_workers_per_gather = 2;
+-- set enable_parallel_hash = on;
+-- TODO: throw an error when this happens: cannot set work_mem lower than the side of a single tuple
+-- TODO: ensure that oversize tuple code is still exercised (should be with some of the stub stuff below)
+-- TODO: commented this out since it would crash otherwise
+-- this test is no longer multi-batch, so, perhaps, it should be removed
+-- set work_mem = '128kB';
+-- explain (costs off)
+-- select length(max(s.t))
+-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+-- select length(max(s.t))
+-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+-- select final > 1 as multibatch
+-- from hash_join_batches(
+-- $$
+-- select length(max(s.t))
+-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+-- $$);
+-- rollback to settings;
rollback;
@@ -538,3 +542,181 @@ WHERE
AND hjtest_1.a <> hjtest_2.b;
ROLLBACK;
+
+-- Serial Adaptive Hash Join
+
+BEGIN;
+CREATE TYPE stub AS (hash INTEGER, value CHAR(8090));
+
+CREATE FUNCTION stub_hash(item stub)
+RETURNS INTEGER AS $$
+DECLARE
+ batch_size INTEGER;
+BEGIN
+ batch_size := 4;
+ RETURN item.hash << (batch_size - 1);
+END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE;
+
+CREATE FUNCTION stub_eq(item1 stub, item2 stub)
+RETURNS BOOLEAN AS $$
+BEGIN
+ RETURN item1.hash = item2.hash AND item1.value = item2.value;
+END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE;
+
+CREATE OPERATOR = (
+ FUNCTION = stub_eq,
+ LEFTARG = stub,
+ RIGHTARG = stub,
+ COMMUTATOR = =,
+ HASHES, MERGES
+);
+
+CREATE OPERATOR CLASS stub_hash_ops
+DEFAULT FOR TYPE stub USING hash AS
+ OPERATOR 1 =(stub, stub),
+ FUNCTION 1 stub_hash(stub);
+
+CREATE TABLE probeside(a stub);
+ALTER TABLE probeside ALTER COLUMN a SET STORAGE PLAIN;
+-- non-fallback batch with unmatched outer tuple
+INSERT INTO probeside SELECT '(2, "")' FROM generate_series(1, 1);
+-- fallback batch unmatched outer tuple (in first stripe maybe)
+INSERT INTO probeside SELECT '(1, "unmatched outer tuple")' FROM generate_series(1, 1);
+-- fallback batch matched outer tuple
+INSERT INTO probeside SELECT '(1, "")' FROM generate_series(1, 5);
+-- fallback batch unmatched outer tuple (in last stripe maybe)
+-- When numbatches=4, hash 5 maps to batch 1, but after numbatches doubles to
+-- 8 batches hash 5 maps to batch 5.
+INSERT INTO probeside SELECT '(5, "")' FROM generate_series(1, 1);
+-- non-fallback batch matched outer tuple
+INSERT INTO probeside SELECT '(3, "")' FROM generate_series(1, 1);
+-- batch with 3 stripes where non-first/non-last stripe contains unmatched outer tuple
+INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 5);
+INSERT INTO probeside SELECT '(6, "unmatched outer tuple")' FROM generate_series(1, 1);
+INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 1);
+
+CREATE TABLE hashside_wide(a stub, id int);
+ALTER TABLE hashside_wide ALTER COLUMN a SET STORAGE PLAIN;
+-- falls back with an unmatched inner tuple that is in fist, middle, and last
+-- stripe
+INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in first stripe")', 1 FROM generate_series(1, 1);
+INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9);
+INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in middle stripe")', 1 FROM generate_series(1, 1);
+INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9);
+INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in last stripe")', 1 FROM generate_series(1, 1);
+
+-- doesn't fall back -- matched tuple
+INSERT INTO hashside_wide SELECT '(3, "")', 3 FROM generate_series(1, 1);
+INSERT INTO hashside_wide SELECT '(6, "")', 6 FROM generate_series(1, 20);
+
+ANALYZE probeside, hashside_wide;
+
+SET enable_nestloop TO off;
+SET enable_mergejoin TO off;
+SET work_mem = 64;
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a);
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+RIGHT OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+RIGHT OUTER JOIN hashside_wide USING (a);
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+FULL OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+FULL OUTER JOIN hashside_wide USING (a);
+
+-- semi-join testcase
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off)
+SELECT probeside.* FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a);
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value)
+FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2;
+
+-- anti-join testcase
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off)
+SELECT probeside.* FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a);
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value)
+FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2;
+
+-- parallel LOJ test case with two batches falling back
+savepoint settings;
+set local max_parallel_workers_per_gather = 1;
+set local min_parallel_table_scan_size = 0;
+set local parallel_setup_cost = 0;
+set local enable_parallel_hash = on;
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a);
+
+SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value)
+FROM probeside
+LEFT OUTER JOIN hashside_wide USING (a)
+ORDER BY 1, 2, 3, 4, 5;
+rollback to settings;
+
+-- Test spill of batch 0 gives correct results.
+CREATE TABLE probeside_batch0(id int generated always as identity, a stub);
+ALTER TABLE probeside_batch0 ALTER COLUMN a SET STORAGE PLAIN;
+INSERT INTO probeside_batch0(a) SELECT '(0, "")' FROM generate_series(1, 13);
+INSERT INTO probeside_batch0(a) SELECT '(0, "unmatched outer")' FROM generate_series(1, 1);
+
+CREATE TABLE hashside_wide_batch0(id int generated always as identity, a stub);
+ALTER TABLE hashside_wide_batch0 ALTER COLUMN a SET STORAGE PLAIN;
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+ANALYZE probeside_batch0, hashside_wide_batch0;
+
+SELECT
+ hashside_wide_batch0.id as hashside_id,
+ (hashside_wide_batch0.a).hash as hashside_hash,
+ probeside_batch0.id as probeside_id,
+ (probeside_batch0.a).hash as probeside_hash,
+ TRIM((probeside_batch0.a).value) as probeside_trimmed_value,
+ TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value
+FROM probeside_batch0
+LEFT OUTER JOIN hashside_wide_batch0 USING (a)
+ORDER BY 1, 2, 3, 4, 5, 6;
+
+set local min_parallel_table_scan_size = 0;
+set local parallel_setup_cost = 0;
+set local enable_hashjoin = on;
+
+savepoint settings;
+set max_parallel_workers_per_gather = 1;
+set enable_parallel_hash = on;
+set work_mem = '64kB';
+
+INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9);
+
+EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside_batch0
+LEFT OUTER JOIN hashside_wide_batch0 USING (a);
+
+SELECT
+ hashside_wide_batch0.id as hashside_id,
+ (hashside_wide_batch0.a).hash as hashside_hash,
+ probeside_batch0.id as probeside_id,
+ (probeside_batch0.a).hash as probeside_hash,
+ TRIM((probeside_batch0.a).value) as probeside_trimmed_value,
+ TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value
+FROM probeside_batch0
+LEFT OUTER JOIN hashside_wide_batch0 USING (a)
+ORDER BY 1, 2, 3, 4, 5, 6;
+rollback to settings;
+
+rollback;
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql
index a56057bd4fadc..cce8ebdb3d9fc 100644
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -449,6 +449,7 @@ insert into outer_text values ('b', null);
create temp table inner_text (c1 text, c2 text);
insert into inner_text values ('a', null);
+insert into inner_text values ('123', '456');
select * from outer_text where (f1, f2) not in (select * from inner_text);
@@ -468,6 +469,46 @@ select 'foo'::text in (select 'bar'::name union all select 'bar'::name);
select '1'::text in (select '1'::name union all select '1'::name);
+--
+-- Test that we don't try to use a hashed subplan if the simplified
+-- testexpr isn't of the right shape
+--
+
+-- this fails by default, of course
+select * from int8_tbl where q1 in (select c1 from inner_text);
+
+begin;
+
+-- make an operator to allow it to succeed
+create function bogus_int8_text_eq(int8, text) returns boolean
+language sql as 'select $1::text = $2';
+
+create operator = (procedure=bogus_int8_text_eq, leftarg=int8, rightarg=text);
+
+explain (costs off)
+select * from int8_tbl where q1 in (select c1 from inner_text);
+select * from int8_tbl where q1 in (select c1 from inner_text);
+
+-- inlining of this function results in unusual number of hash clauses,
+-- which we can still cope with
+create or replace function bogus_int8_text_eq(int8, text) returns boolean
+language sql as 'select $1::text = $2 and $1::text = $2';
+
+explain (costs off)
+select * from int8_tbl where q1 in (select c1 from inner_text);
+select * from int8_tbl where q1 in (select c1 from inner_text);
+
+-- inlining of this function causes LHS and RHS to be switched,
+-- which we can't cope with, so hashing should be abandoned
+create or replace function bogus_int8_text_eq(int8, text) returns boolean
+language sql as 'select $2 = $1::text';
+
+explain (costs off)
+select * from int8_tbl where q1 in (select c1 from inner_text);
+select * from int8_tbl where q1 in (select c1 from inner_text);
+
+rollback; -- to get rid of the bogus operator
+
--
-- Test case for planner bug with nested EXISTS handling
--
diff --git a/src/test/regress/sql/sysviews.sql b/src/test/regress/sql/sysviews.sql
index 28e412b73530b..ac4a0e1cbba7e 100644
--- a/src/test/regress/sql/sysviews.sql
+++ b/src/test/regress/sql/sysviews.sql
@@ -12,6 +12,11 @@ select count(*) >= 0 as ok from pg_available_extension_versions;
select count(*) >= 0 as ok from pg_available_extensions;
+-- The entire output of pg_backend_memory_contexts is not stable,
+-- we test only the existance and basic condition of TopMemoryContext.
+select name, ident, parent, level, total_bytes >= free_bytes
+ from pg_backend_memory_contexts where level = 0;
+
-- At introduction, pg_config had 23 entries; it may grow
select count(*) > 20 as ok from pg_config;
diff --git a/src/test/ssl/t/SSLServer.pm b/src/test/ssl/t/SSLServer.pm
index 1e392b8fbf614..f5987a003efd6 100644
--- a/src/test/ssl/t/SSLServer.pm
+++ b/src/test/ssl/t/SSLServer.pm
@@ -9,7 +9,6 @@
# - a database called trustdb that lets anyone in
# - another database called certdb that uses certificate authentication, ie.
# the client must present a valid certificate signed by the client CA
-# - two users, called ssltestuser and anotheruser.
#
# The server is configured to only accept connections from localhost. If you
# want to run the client from another host, you'll have to configure that