From ac1f7d2c7282b75dc2f827bd885e20cc531e9a94 Mon Sep 17 00:00:00 2001 From: "Karl O. Pinc kop@karlpinc.com" Date: Mon, 13 Oct 2025 17:38:07 +0000 Subject: [PATCH] Support converting a single table using multiple, serial, connections --- conversion/Makefile | 15 ++- conversion/load_chunks.sh | 103 +++++++++++++++ conversion/load_finish.sql | 22 +++ ...low_arrival.sql => load_follow_arrival.m4} | 76 +---------- conversion/load_follow_arrival_sanity.sql | 125 ++++++++++++++++++ 5 files changed, 264 insertions(+), 77 deletions(-) create mode 100755 conversion/load_chunks.sh rename conversion/{load_follow_arrival.sql => load_follow_arrival.m4} (64%) create mode 100644 conversion/load_follow_arrival_sanity.sql diff --git a/conversion/Makefile b/conversion/Makefile index 1f23e01..bf74ad6 100644 --- a/conversion/Makefile +++ b/conversion/Makefile @@ -275,7 +275,7 @@ after_make_clean: load_data load_data: load_support load_biography load_community_membership \ load_biography_update_log load_community_membership_update_log \ load_follow_to_follows load_follow_to_other \ - load_follow_arrival \ + load_follow_arrival_sanity load_follow_arrival \ load_finish .PHONY: load_support @@ -322,12 +322,19 @@ load_follow_to_other: $(PSQL_DEPENDS) cat load_follow_to_other.sql ;) \ | psql $(PSQL_ARGS) $(PSQL_SINGLE_TRANS) --set=ON_ERROR_STOP=y -.PHONY: load_follow_arrival -load_follow_arrival: $(PSQL_DEPENDS) +.PHONY: load_follow_arrival_sanity +load_follow_arrival_sanity: $(PSQL_DEPENDS) ( $(PSQL_SETUP) \ - cat load_follow_arrival.sql ;) \ + cat load_follow_arrival_sanity.sql ;) \ | psql $(PSQL_ARGS) $(PSQL_SINGLE_TRANS) --set=ON_ERROR_STOP=y +.PHONY: load_follow_arrival +load_follow_arrival: $(PSQL_DEPENDS) + PSQL_SETUP='$(PSQL_SETUP)' \ + PSQL_ARGS='$(PSQL_ARGS)' \ + PSQL_SINGLE_TRANS='$(PSQL_SINGLE_TRANS)' \ + ./load_chunks.sh load_follow_arrival.m4 clean.follow_arrival + .PHONY: load_finish load_finish: $(PSQL_DEPENDS) ( $(PSQL_SETUP) \ diff --git a/conversion/load_chunks.sh b/conversion/load_chunks.sh new file mode 100755 index 0000000..43c9247 --- /dev/null +++ b/conversion/load_chunks.sh @@ -0,0 +1,103 @@ +#!/bin/sh +# Copyright (C) 2025 The Meme Factory, Inc. http://www.karlpinc.com/ +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# Load a file in pieces, serially, using multiple connections. +# +# Syntax: load_chunks.sh m4_sql_source table_name +# +# Input: +# m4_sql_source +# A m4 file containing SQL (psql input) that loads from a +# table. The m4 code should use 2 macros: +# +# m4_chunk_offset_amount +# The value of a SELECT's OFFSET keyword. +# m4_chunk_limit_amount +# The value of a SELECT's LIMIT keyword. +# +# So, the m4 code should contain something like: +# +# INSERT INTO target_table (columns...) +# SELECT columns... +# FROM table_name +# ORDER BY something_unique +# LIMIT m4_chunk_limit_amount +# OFFSET m4_chunk_offset_amount; +# +# table_name +# The name of the table that is the source of the data. +# This should (probably) be schema qualified. +# +# Environment: +# +# PSQL_SETUP +# PSQL_ARGS +# PSQL_SINGLE_TRANS +# These are the same as the corresponding Makefile variables. +# +# Remarks: +# Updating table statistics does not take effect everwhere within +# a single connection to the db, because query plans and function +# plans are cached, per-connection. See: +# https://www.postgresql.org/docs/current/plpgsql-implementation.html#PLPGSQL-PLAN-CACHING +# +# When you load an empty db, there are no statistics. So initial +# query plans are bad. This can make loading into an empty database +# take exponentially increasing amounts of time. +# +# This script is part of a solution. It allows loading of a source +# data table in pieces, doing an analyze after each piece and starting +# a new connection to continue the table load using the new statistics. +# +# This could be done with psql variable substutition, but using m4 seems +# both cleaner and more explicit. +# +# Does a fibnacci progression to load in increasingly larger chunks. +# +# Karl O. Pinc + +# Constants +INITIAL_ROWS=4000 + +# Parse command line +export m4_source="$1" +export table_name="$2" + +# Find the number of rows in the source table +# (This produces a number with a leading space character. We don't care.) +export psql_command="psql ${PSQL_ARGS} --tuples-only" +rows_available=$( (eval ${PSQL_SETUP} ; \ + echo "SELECT COUNT(*) FROM ${table_name};" ;) \ + | eval ${psql_command}) + +export offset=0 +export last_limit=0 +export limit=${INITIAL_ROWS} +psql_command="psql ${PSQL_ARGS} ${PSQL_SINGLE_TRANS} --set=ON_ERROR_STOP=y" +while [ ${offset} -lt ${rows_available} ] ; do + ( eval "${PSQL_SETUP}" + m4 -D m4_chunk_offset_amount=${offset} \ + -D m4_chunk_limit_amount=${limit} ${m4_source} ;) \ + | eval ${psql_command} + + result=$? + [ ${result} -ne 0 ] && exit ${result} + + offset=$((${offset} + ${limit})) + saved_limit=${limit} + limit=$((${limit} + ${last_limit})) + last_limit=${saved_limit} +done diff --git a/conversion/load_finish.sql b/conversion/load_finish.sql index 658c737..29e6553 100644 --- a/conversion/load_finish.sql +++ b/conversion/load_finish.sql @@ -24,6 +24,28 @@ SET search_path TO sokwedb, codes, lib, clean, libconv; -- Finish -- +-- Sanity checks +SELECT 'Sanity checks commented out!'; +DO $$ +BEGIN +if false then +-- Start of follow time is NULL +PERFORM * + FROM clean.follow_arrival + WHERE NOT EXISTS + (SELECT follows.fid + FROM follows + WHERE follows.focal = follow_arrival.fa_fol_b_focal_animid + AND follows.date = follow_arrival.fa_fol_date); +IF FOUND THEN + RAISE data_exception USING + MESSAGE = 'There are follow_arrivals where there is no follow'; +END IF; +end if; +END; +$$; + + -- Make the unknown person unusable. UPDATE people SET active = FALSE diff --git a/conversion/load_follow_arrival.sql b/conversion/load_follow_arrival.m4 similarity index 64% rename from conversion/load_follow_arrival.sql rename to conversion/load_follow_arrival.m4 index 7a8f78f..0cbdb74 100644 --- a/conversion/load_follow_arrival.sql +++ b/conversion/load_follow_arrival.m4 @@ -24,58 +24,6 @@ SET search_path TO sokwedb, codes, lib, clean, libconv; -- Follow_arrival -- --- Sanity checks -DO $$ -BEGIN --- Start of follow time is NULL -PERFORM * FROM clean.follow_arrival WHERE fa_time_start IS NULL; -IF FOUND THEN - RAISE data_exception USING - MESSAGE = 'There are follow_arrivals where fa_time_start is NULL'; -END IF; - --- End of follow time is NULL -PERFORM * FROM clean.follow_arrival WHERE fa_time_end IS NULL; -IF FOUND THEN - RAISE data_exception USING - MESSAGE = 'There are follow_arrivals where fa_time_end is NULL'; -END IF; - --- follow_arrival.fa_type_of_nesting has a valid value -PERFORM * - FROM clean.follow_arrival - WHERE (fa_type_of_nesting NOT IN (0, 1, 2, 3) - OR fa_type_of_nesting IS NULL); -IF FOUND THEN - RAISE data_exception USING - MESSAGE = 'There are follow_arrivals where fa_type_of_nesting is not ' - || '0, 1, 2, 3, 4'; -END IF; - --- follow.fol_flag_begin_in_nest is not 0 or 1 -PERFORM * - FROM clean.follow - WHERE fol_flag_begin_in_nest <> 0 AND fol_flag_begin_in_nest <> 1; -IF FOUND THEN - RAISE data_exception USING - MESSAGE = 'There are follows where fol_flag_begin_in_nest is not ' - || '0 or 1'; -END IF; - --- follow.fol_flag_end_in_nest is not 0 or 1 -PERFORM * - FROM clean.follow - WHERE fol_flag_end_in_nest <> 0 AND fol_flag_end_in_nest <> 1; -IF FOUND THEN - RAISE data_exception USING - MESSAGE = 'There are follows where fol_flag_end_in_nest is not ' - || '0 or 1'; -END IF; - -END; -$$; - - -- -- follow_arrival -- @@ -102,6 +50,9 @@ BEGIN ORDER BY follow_arrival.fa_fol_date , follow_arrival.fa_fol_b_focal_animid , follow_arrival.fa_b_arr_animid + , follow_arrival.fa_seq_num + LIMIT m4_chunk_limit_amount + OFFSET m4_chunk_offset_amount LOOP -- Which follow belongs to the follow_arrival? SELECT follows.fid @@ -172,27 +123,6 @@ BEGIN END; $$; -- follow_arrival DO block --- Sanity checks -SELECT 'Sanity checks commented out!'; -DO $$ -BEGIN -if false then --- Start of follow time is NULL -PERFORM * - FROM clean.follow_arrival - WHERE NOT EXISTS - (SELECT follows.fid - FROM follows - WHERE follows.focal = follow_arrival.fa_fol_b_focal_animid - AND follows.date = follow_arrival.fa_fol_date); -IF FOUND THEN - RAISE data_exception USING - MESSAGE = 'There are follow_arrivals where there is no follow'; -END IF; -end if; -END; -$$; - ANALYZE events , roles , arrivals; diff --git a/conversion/load_follow_arrival_sanity.sql b/conversion/load_follow_arrival_sanity.sql new file mode 100644 index 0000000..d15e3bd --- /dev/null +++ b/conversion/load_follow_arrival_sanity.sql @@ -0,0 +1,125 @@ +-- Copyright (C) 2023, 2024, 2025 The Meme Factory, Inc. +-- http://www.karlpinc.com/ +-- +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU Affero General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- License, or (at your option) any later version. +-- +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU Affero General Public License for more details. +-- +-- You should have received a copy of the GNU Affero General Public License +-- along with this program. If not, see . +-- +-- Karl O. Pinc + +-- Setup the search path +SET search_path TO sokwedb, codes, lib, clean, libconv; + + +-- +-- Follow_arrival sanity checks +-- + +DO $$ +BEGIN +-- Start of follow time is NULL +PERFORM * FROM clean.follow_arrival WHERE fa_time_start IS NULL; +IF FOUND THEN + RAISE data_exception USING + MESSAGE = 'There are follow_arrivals where fa_time_start is NULL'; +END IF; + +-- End of follow time is NULL +PERFORM * FROM clean.follow_arrival WHERE fa_time_end IS NULL; +IF FOUND THEN + RAISE data_exception USING + MESSAGE = 'There are follow_arrivals where fa_time_end is NULL'; +END IF; + +-- follow_arrival.fa_type_of_nesting has a valid value +PERFORM * + FROM clean.follow_arrival + WHERE (fa_type_of_nesting NOT IN (0, 1, 2, 3) + OR fa_type_of_nesting IS NULL) + AND NOT -- problem #41 + ((fa_fol_date = '1975-05-15' + AND fa_fol_b_focal_animid = 'SW' + AND fa_b_arr_animid = 'SA') + OR (fa_fol_date = '1975-05-15' + AND fa_fol_b_focal_animid = 'SW' + AND fa_b_arr_animid = 'SW') + OR (fa_fol_date = '1982-08-31' + AND fa_fol_b_focal_animid = 'GB' + AND fa_b_arr_animid = 'MM') + OR (fa_fol_date = '1988-10-26' + AND fa_fol_b_focal_animid = 'PF' + AND fa_b_arr_animid = 'GL') + OR (fa_fol_date = '2006-03-14' + AND fa_fol_b_focal_animid = 'SW' + AND fa_b_arr_animid = 'TZN') + OR (fa_fol_date = '2006-03-14' + AND fa_fol_b_focal_animid = 'SW' + AND fa_b_arr_animid = 'TZN') + OR (fa_fol_date = '2006-03-14' + AND fa_fol_b_focal_animid = 'SW' + AND fa_b_arr_animid = 'TZN') + OR (fa_fol_date = '2006-03-14' + AND fa_fol_b_focal_animid = 'SW' + AND fa_b_arr_animid = 'TZN') + OR (fa_fol_date = '2009-01-11' + AND fa_fol_b_focal_animid = 'FO' + AND fa_b_arr_animid = 'TB') + OR (fa_fol_date = '2011-12-26' + AND fa_fol_b_focal_animid = 'GLD' + AND fa_b_arr_animid = 'GA') + OR (fa_fol_date = '2011-12-26' + AND fa_fol_b_focal_animid = 'GLD' + AND fa_b_arr_animid = 'GGL')); +IF FOUND THEN + RAISE data_exception USING + MESSAGE = 'There are follow_arrivals where fa_type_of_nesting is not ' + || '0, 1, 2, 3, 4'; +END IF; + +-- follow.fol_flag_begin_in_nest is not 0 or 1 +PERFORM * + FROM clean.follow + WHERE fol_flag_begin_in_nest <> 0 AND fol_flag_begin_in_nest <> 1; +IF FOUND THEN + RAISE data_exception USING + MESSAGE = 'There are follows where fol_flag_begin_in_nest is not ' + || '0 or 1'; +END IF; + +-- follow.fol_flag_end_in_nest is not 0 or 1 +PERFORM * + FROM clean.follow + WHERE fol_flag_end_in_nest <> 0 AND fol_flag_end_in_nest <> 1; +IF FOUND THEN + RAISE data_exception USING + MESSAGE = 'There are follows where fol_flag_end_in_nest is not ' + || '0 or 1'; +END IF; + +-- The sort order used in load_follow_arrival.m4 generates unique +-- values. Without this check the staged loading might miss or duplicate +-- a row. +PERFORM 1 + FROM clean.follow_arrival + GROUP BY fa_fol_date, fa_fol_b_focal_animid, fa_b_arr_animid, fa_seq_num + HAVING count(*) > 1 + ORDER BY fa_fol_date, fa_fol_b_focal_animid, fa_b_arr_animid, fa_seq_num; +IF FOUND THEN + RAISE data_exception USING + MESSAGE = 'There are follow_arrivals where there are duplicates of' + || ' the combination of:' + || ' fa_fol_date, fa_fol_b_focal_animid,' + || ' fa_b_arr_animid, fa_seq_num'; +END IF; + +END; +$$; -- 2.34.1