From 4497b0836f3a4d73ae25c95d7d3932d3d6d83918 Mon Sep 17 00:00:00 2001 From: "Karl O. Pinc" Date: Thu, 30 Nov 2023 15:31:11 -0600 Subject: [PATCH] Progress towards moving demography data into a production-like structure --- conversion/Makefile | 30 +++++++- conversion/clean.sql | 42 +++++++++++ conversion/createschemas.sql | 4 + conversion/load_data.sql | 138 +++++++++++++++++++++++++++++++++++ 4 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 conversion/clean.sql create mode 100644 conversion/load_data.sql diff --git a/conversion/Makefile b/conversion/Makefile index a68ce5e..88529a4 100644 --- a/conversion/Makefile +++ b/conversion/Makefile @@ -75,6 +75,7 @@ CONVERSION_SCHEMA := raw LIBCONV_SCHEMA = libconv TIDY_SCHEMA := tidy EASY_SCHEMA := easy +CLEAN_SCHEMA := clean PSQL_RAW_SETUP := $(PSQL_SETUP) \ printf 'SET search_path TO $(CONVERSION_SCHEMA);\n' ; PSQL_LIBCONV_SETUP := $(PSQL_SETUP) \ @@ -84,6 +85,8 @@ PSQL_TIDY_SETUP := $(PSQL_SETUP) \ $(TIDY_SCHEMA) $(LIBCONV_SCHEMA) ; PSQL_EASY_SETUP := $(PSQL_SETUP) \ printf 'SET search_path TO $(EASY_SCHEMA);\n' ; +PSQL_CLEAN_SETUP := $(PSQL_SETUP) \ + printf 'SET search_path TO $(CLEAN_SCHEMA);\n' ; ## ## The available targets for make (make TARGET) are: @@ -94,7 +97,7 @@ PSQL_EASY_SETUP := $(PSQL_SETUP) \ ## the converted data in the TARGET_DB. .PHONY: convert convert: create-schema restore grant_permissions copy_tidy make_tidy \ - copy_easy make_easy + copy_easy make_easy copy_clean make_clean load_data ## clean Delete all user-generated files .PHONY: clean @@ -176,3 +179,28 @@ make_easy: $(PSQL_DEPENDS) clean_names.sql ( $(PSQL_EASY_SETUP) \ cat clean_names.sql ; ) \ | psql $(PSQL_ARGS) $(PSQL_SINGLE_TRANS) --set=ON_ERROR_STOP=y + +## copy_clean Copy the easy schema into the clean schema +.PHONY: copy_clean +copy_clean: $(PSQL_DEPENDS) + # Copy the easy schema to the clean schema + PSQL_SETUP="$(PSQL_SETUP)" \ + PSQL_ARGS="$(PSQL_ARGS)" \ + PSQL_ARGS_MINIMAL="$(PSQL_ARGS_MINIMAL)" \ + PSQL_SINGLE_TRANS="$(PSQL_SINGLE_TRANS)" \ + ./copy_schema.sh easy clean + +## make_clean Do some data cleanup in the clean schema +.PHONY: make_clean +make_clean: $(PSQL_DEPENDS) clean.sql + ( $(PSQL_CLEAN_SETUP) \ + cat clean.sql ; ) \ + | psql $(PSQL_ARGS) $(PSQL_SINGLE_TRANS) --set=ON_ERROR_STOP=y + +## load_data Copy the data from the easy schema into the +## production schemas. +.PHONY: load_data +load_data: $(PSQL_DEPENDS) + ( $(PSQL_SETUP) \ + cat load_data.sql ;) \ + | psql $(PSQL_ARGS) $(PSQL_SINGLE_TRANS) --set=ON_ERROR_STOP=y diff --git a/conversion/clean.sql b/conversion/clean.sql new file mode 100644 index 0000000..3c40b2d --- /dev/null +++ b/conversion/clean.sql @@ -0,0 +1,42 @@ +-- Copyright (C) 2023 The Meme Factory, Inc. http://www.karlpinc.com/ +-- +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU Affero General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- License, or (at your option) any later version. +-- +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU Affero General Public License for more details. +-- +-- You should have received a copy of the GNU Affero General Public License +-- along with this program. If not, see . +-- +-- Cleanup the data, somewhat, in the "clean" schema. +-- +-- Karl O. Pinc + +-- BIOGRAPHY + +-- (#6) BIOGRAPHY.B_AnimID_num column contains the empty string +-- Change '' to NULL +UPDATE biography + SET b_animid_num = NULL + WHERE b_animid_num = ''; + +-- (#7) BIOGRAPHY.B_AnimID_num column is textual +-- Get rid of leading "CH" +UPDATE biography + SET b_animid_num = SUBSTRING(b_animid_num FROM 3) + WHERE UPPER(SUBSTRING(b_animid_num FOR 2)) = 'CH'; +-- Change data type to INT +ALTER TABLE biography + ALTER b_animid_num TYPE INT + USING b_animid_num::INTEGER; + +-- (#8) BIOGRAPHY.DadID_publication_info column contains NULL values +-- Change NULL to '' +UPDATE biography + SET b_dadid_publication_info = '' + WHERE b_dadid_publication_info IS NULL; diff --git a/conversion/createschemas.sql b/conversion/createschemas.sql index 906b8af..0e445dd 100644 --- a/conversion/createschemas.sql +++ b/conversion/createschemas.sql @@ -30,3 +30,7 @@ GRANT USAGE ON SCHEMA tidy TO reader, writer; -- easy: Like "tidy", but with table names that don't need quoting CREATE SCHEMA IF NOT EXISTS easy; GRANT USAGE ON SCHEMA easy TO reader, writer; + +-- clean: Like "easy", but with some cleanup of data +CREATE SCHEMA IF NOT EXISTS clean; +GRANT USAGE ON SCHEMA clean TO reader, writer; diff --git a/conversion/load_data.sql b/conversion/load_data.sql new file mode 100644 index 0000000..191bf9e --- /dev/null +++ b/conversion/load_data.sql @@ -0,0 +1,138 @@ +-- Copyright (C) 2023 The Meme Factory, Inc. http://www.karlpinc.com/ +-- +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU Affero General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- License, or (at your option) any later version. +-- +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU Affero General Public License for more details. +-- +-- You should have received a copy of the GNU Affero General Public License +-- along with this program. If not, see . +-- +-- Karl O. Pinc + +-- Setup the search path +SET search_path TO sokwedb, lookup, lib, clean, libconv; + + +-- Support tables + + +-- community_lookup -> comm_ids +INSERT INTO comm_ids (commid, name, notes, membcriteria) + SELECT community_lookup.cl_community_id + , community_lookup.cl_community_name + , community_lookup.cl_details + , '' + FROM community_lookup; + +-- departtypes +INSERT INTO departtypes (departtype, description) + VALUES ('D', 'Death') + , ('E', 'Emigration') + , ('P', 'Permanent disappearance') + , ('O', 'End of observation; Present in the most recent census'); + +-- entrytypes +INSERT INTO entrytypes (entrytype, description) + VALUES ('B', 'Birth') + , ('I', 'Immigration') + , ('C', 'Start of confirmed AnimID') + , ('O', 'Initiation of close observation'); + +-- people +INSERT INTO people (person, name, description, active) + VALUES ('EVL', 'EVL', '', true) + , ('JR', 'JR', '', true) + , ('KW', 'KW', '', true) + , ('SF', 'SF', '', true) + , ('ICG', 'ICG', '', true); + + +-- Data tables + +-- biography -> biography +INSERT INTO biography( + animid + ,animidnum + ,animname + ,birthcomm + ,bccertainty + ,sex + ,momid + ,dadid + ,dadidpub + ,firstborn + ,birthdate + ,bdmin + ,bdmax + ,bddist + ,entrydate + ,entrytype + ,departdate + ,departtype) + SELECT + b_animid + , b_animid_num::INT + , b_animname + , b_birthgroup + , b_bgcertainty + , b_sex + , b_momid + , b_dadid + , b_dadid_publication_info + , b_firstborn + , b_birthdate + , b_bdmin + , b_bdmax + , b_bddist + , b_entrydate + , b_entrytype + , b_departdate + -- Discard b_departdateerror + , b_departtype + FROM clean.biography; + + +-- community_membership -> comm_membs +INSERT INTO comm_membs ( + animid + ,startdate + ,enddate + ,commid + ,startsource + ,endsource) + SELECT + cm_b_animid + , cm_start_date + , cm_end_date + , cm_cl_community_id + , cm_start_source + , cm_end_source + FROM community_membership + -- Get a consistent ID generation + ORDER BY cm_start_date, cm_end_date, cm_b_animid, cm_cl_community_id; + + +-- community_membership_update_log -> comm_memb_log +INSERT INTO comm_memb_log ( + dateofupdate + ,animid + ,description + ,rationale + ,madeby) + SELECT + date_of_update + , chimp_id + , update_description + , update_rationale + , made_by + FROM community_membership_update_log + -- Get a consistent id generation + ORDER BY date_of_update, chimp_id, update_rationale, made_by; + + -- 2.34.1