From e29b2ef21b2278fcd3aaa055268a4cdc9e361ab0 Mon Sep 17 00:00:00 2001 From: "Karl O. Pinc kop@karlpinc.com" Date: Tue, 5 May 2026 16:06:38 +0000 Subject: [PATCH] Resolve problem #24, and introduce framework conversion data changes This introduces the conv_state schema, where state is saved that records "automatic" changes the conversion makes to the converted data. When the MS Access data is changed and the conversion re-run, the "new" automatic changes can be compared to the changes made to the previous data set. The "compare-state" make target can be used to report differences between the old and new data. Review this report to ensure that the automated "fixes" do not change something that should not be changed. --- conversion/Makefile | 45 ++++++++++++++++++++++++++++++++++-- conversion/clean.sql | 39 ++++++++++++++++++++++++++++++- conversion/compare_state.sh | 36 +++++++++++++++++++++++++++++ conversion/compare_state.sql | 30 ++++++++++++++++++++++++ conversion/copy_state.sql | 33 ++++++++++++++++++++++++++ conversion/createschemas.sql | 4 ++++ 6 files changed, 184 insertions(+), 3 deletions(-) create mode 100644 conversion/compare_state.sh create mode 100644 conversion/compare_state.sql create mode 100644 conversion/copy_state.sql diff --git a/conversion/Makefile b/conversion/Makefile index a41541b..7e9d394 100644 --- a/conversion/Makefile +++ b/conversion/Makefile @@ -66,12 +66,18 @@ PSQL_SETUP: $(GENERIC_DEPENDS) ## CONVERSION_SCHEMA ## The schema into which the dump of the original MS Access ## database is restored. The default is: raw +## +## COMPARE_STATE +## When "Y", the default, compare the state of the +## conversion to the state of the conversion of a previous +## conversion run. (See the state related targets, below.) # # The conversion portion of the build system # CONVERSION_SCHEMA := raw +COMPARE_STATE := Y LIBCONV_SCHEMA = libconv TIDY_SCHEMA := tidy EASY_SCHEMA := easy @@ -87,7 +93,6 @@ PSQL_EASY_SETUP := $(PSQL_SETUP) \ printf 'SET search_path TO $(EASY_SCHEMA);\n' ; PSQL_CLEAN_SETUP := $(PSQL_SETUP) \ printf 'SET search_path TO $(CLEAN_SCHEMA);\n' ; - ## ## The available targets for make (make TARGET) are: ## @@ -256,7 +261,28 @@ make_clean: $(PSQL_DEPENDS) clean.sql ## Complete the conversion steps that follow the ## make_clean step .PHONY: after_make_clean -after_make_clean: load_data +after_make_clean: compare_state after_compare_state + +## compare_state +## Compare the state of clean to the state, +## from a previous conversion run, in conv_state +## An error is raised if they differ and +## COMPARE_STATE=Y, the default. +.PHONY: compare_state +compare_state: $(PSQL_DEPENDS) compare_state.sql + if [ "$(COMPARE_STATE)" = 'Y' ] ; then \ + PSQL_SETUP="$(PSQL_SETUP)" \ + PSQL_ARGS_BASE='$(PSQL_ARGS_BASE)' \ + PSQL_ARGS_MINIMAL=$(PSQL_ARGS_MINIMAL) \ + PSQL_SINGLE_TRANS='$(PSQL_SINGLE_TRANS)' \ + bash compare_state.sh ; \ + fi + +## after_compare_state +## Complete the conversion steps that follow the +## compare_state step +.PHONY: after_compare_state +after_compare_state: load_data ## load_data Copy the data from the clean schema into the ## production schemas. @@ -340,3 +366,18 @@ load_finish: $(PSQL_DEPENDS) ( $(PSQL_SETUP) \ cat load_finish.sql ;) \ | psql $(PSQL_ARGS) $(PSQL_SINGLE_TRANS) --set=ON_ERROR_STOP=y + +## copy_state Copy records of conversion state from the +## copy schema to the conv_state schema +## This step is _not_ run as part of the regular +## sequence of conversion steps. +## CAUTION: Any time there is a new data dump, +## the state of the copied rows should be copied to +## the conv_state schema, but only after +## comparision between the old and new set of +## state, and after verification of +## discrepencies. +copy_state: $(PSQL_DEPENDS) copy_state.sql + ( $(PSQL_CLEAN_SETUP) \ + cat copy_state.sql ; ) \ + | psql $(PSQL_ARGS) $(PSQL_SINGLE_TRANS) --set=ON_ERROR_STOP=y diff --git a/conversion/clean.sql b/conversion/clean.sql index b73809a..bd6b052 100644 --- a/conversion/clean.sql +++ b/conversion/clean.sql @@ -1,4 +1,5 @@ --- Copyright (C) 2023, 2025 The Meme Factory, Inc. http://www.karlpinc.com/ +-- Copyright (C) 2023, 2025-2026 The Meme Factory, Inc. +-- http://www.karlpinc.com/ -- -- This program is free software: you can redistribute it and/or modify -- it under the terms of the GNU Affero General Public License as @@ -286,6 +287,42 @@ UPDATE follow_arrival SET fa_data_source = 'Tiki' WHERE fa_data_source = 'TikI'; + +-- Problem #24 Follow starts are not first arrivals +-- There are cases where the old FOLLOW table's columns +-- "fol_time_begin" is not the first arrival time, fa_time_start, on +-- FOLLOW_ARRIVAL. + +-- Save a record of what the discrepencies look like +WITH spans AS + (SELECT fa_fol_date, fa_fol_b_focal_animid + , MIN(fa_time_start) AS min_start + FROM clean.follow_arrival + WHERE fa_b_arr_animid = fa_fol_b_focal_animid + GROUP BY fa_fol_date, fa_fol_b_focal_animid) +SELECT spans.*, follow.fol_time_begin + INTO problem_24 + FROM spans + JOIN clean.follow + ON (follow.fol_date = spans.fa_fol_date + AND follow.fol_b_animid = spans.fa_fol_b_focal_animid) + WHERE NOT EXISTS + (SELECT 1 + FROM clean.follow + WHERE follow.fol_date = spans.fa_fol_date + AND follow.fol_b_animid = spans.fa_fol_b_focal_animid + AND spans.min_start = follow.fol_time_begin); +GRANT SELECT ON problem_24 TO reader, writer; + +-- "Fix" the follow start time +UPDATE follow + SET fol_time_begin = problem_24.min_start + FROM problem_24 + WHERE follow.fol_date = problem_24.fa_fol_date + AND follow.fol_b_animid = problem_24.fa_fol_b_focal_animid + AND follow.fol_time_begin = problem_24.fol_time_begin; + + -- Problems #26 and #27 -- There can be a mis-match between the first follow_arrival row for the -- focal and the follow row, regarding whether the focal started the diff --git a/conversion/compare_state.sh b/conversion/compare_state.sh new file mode 100644 index 0000000..4f60a75 --- /dev/null +++ b/conversion/compare_state.sh @@ -0,0 +1,36 @@ +# Copyright (C) 2026 The Meme Factory, Inc. http://www.karlpinc.com/ +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# Diff the problem state history tables +# +# Relies on the following environment variables: +# PSQL_CLEAN_SETUP +# PSQL_CONV_STATE_SETUP +# PSQL_ARGS_BASE +# PSQL_ARGS_MINIMAL +# PSQL_SINGLE_TRANS +# +# Karl O. Pinc +diff -u \ + <(( ${PSQL_SETUP} ; \ + echo 'set search_path=clean;' ; \ + cat compare_state.sql ; ) \ + | psql ${PSQL_ARGS_BASE} "${PSQL_ARGS_MINIMAL}" ${PSQL_SINGLE_TRANS} \ + --set=ON_ERROR_STOP=y ; ) \ + <(( ${PSQL_SETUP} ; \ + echo 'set search_path=conv_state;' ; \ + cat compare_state.sql ; ) \ + | psql ${PSQL_ARGS_BASE} "${PSQL_ARGS_MINIMAL}" ${PSQL_SINGLE_TRANS} \ + --set=ON_ERROR_STOP=y ; ) diff --git a/conversion/compare_state.sql b/conversion/compare_state.sql new file mode 100644 index 0000000..be87a2d --- /dev/null +++ b/conversion/compare_state.sql @@ -0,0 +1,30 @@ +-- Copyright (C) 2026 The Meme Factory, Inc. http://www.karlpinc.com/ +-- +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU Affero General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- License, or (at your option) any later version. +-- +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU Affero General Public License for more details. +-- +-- You should have received a copy of the GNU Affero General Public License +-- along with this program. If not, see . +-- +-- Output the problem state history tables, of the current search_path. +-- +-- Karl O. Pinc +-- +-- Remarks: +-- The search_path is set by the caller. + +SELECT 'Problem #24, Follow starts are not first arrivals'; + +SELECT * + FROM problem_24 + ORDER BY problem_24.fa_fol_date + , problem_24.fa_fol_b_focal_animid + , problem_24.fol_time_begin + , problem_24.min_start; diff --git a/conversion/copy_state.sql b/conversion/copy_state.sql new file mode 100644 index 0000000..74adbea --- /dev/null +++ b/conversion/copy_state.sql @@ -0,0 +1,33 @@ +-- Copyright (C) 2026 The Meme Factory, Inc. http://www.karlpinc.com/ +-- +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU Affero General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- License, or (at your option) any later version. +-- +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU Affero General Public License for more details. +-- +-- You should have received a copy of the GNU Affero General Public License +-- along with this program. If not, see . +-- +-- Copy data that is not to be loaded into the production schemas, from the +-- clean schema to the rejects schema. +-- +-- Karl O. Pinc +-- +-- Remarks: +-- The search_path is the clean schema. + +-- +-- Problem #24 +-- + +DROP TABLE IF EXISTS conv_state.problem_24; + +CREATE TABLE conv_state.problem_24 AS + SELECT * FROM problem_24; + +GRANT SELECT ON conv_state.problem_24 TO reader, writer; diff --git a/conversion/createschemas.sql b/conversion/createschemas.sql index 0e445dd..3883694 100644 --- a/conversion/createschemas.sql +++ b/conversion/createschemas.sql @@ -34,3 +34,7 @@ GRANT USAGE ON SCHEMA easy TO reader, writer; -- clean: Like "easy", but with some cleanup of data CREATE SCHEMA IF NOT EXISTS clean; GRANT USAGE ON SCHEMA clean TO reader, writer; + +-- conv_state: Copy of tables in the clean schema that are +CREATE SCHEMA IF NOT EXISTS conv_state; +GRANT USAGE ON SCHEMA conv_state TO reader, writer; -- 2.34.1