Update code and data

This commit is contained in:
Regina Obe
2026-01-17 18:57:46 -05:00
parent 5ba13f6470
commit 85c6b50be2
16 changed files with 245624 additions and 239 deletions

View File

@@ -1,12 +0,0 @@
-- Generate the DDL for staging.factfinder_import (three geo columns plus
-- s01..s51 and a matching sNN_perc column for each) into create_script.sql,
-- then execute that generated script.
-- \a and \t toggle unaligned and tuples-only output so the generated file
-- holds only the SQL text, with no header, padding, or row-count footer.
\a
\t
\cd /temp
CREATE SCHEMA IF NOT EXISTS staging;
-- No \g is issued before the query: with an empty query buffer \g would
-- re-run whatever query the session executed last and write its output to
-- the script file. ORDER BY i keeps the generated columns in s01..s51 order.
SELECT 'CREATE TABLE staging.factfinder_import(geo_id varchar(255)
, geo_id2 varchar(255), geo_display varchar(255)
, '|| array_to_string(array_agg('s' || lpad(i::text,2, '0')
|| ' varchar(255), s' || lpad(i::text,2, '0') || '_perc varchar(255) ' ORDER BY i), ',') || ');' As create_sql
FROM generate_series(1,51) As i
\g create_script.sql
\i create_script.sql

19
ch04/build_stats.psql Normal file
View File

@@ -0,0 +1,19 @@
-- Build create_script.sql, a script that (re)creates staging.table_stats and
-- inserts one row per pg_catalog relation with its row count, then run it.
-- \a and \t toggle unaligned and tuples-only output so the generated file
-- holds only SQL text.
\a
\t
\cd /tmp
-- \o redirects the output of every following query to the file until the
-- bare \o below; \g would only redirect a single query.
\o create_script.sql
-- Everything printed while \o is active lands in the file, so this section
-- only runs SELECTs that emit SQL text; the DDL itself executes later via \i.
SELECT 'CREATE SCHEMA IF NOT EXISTS staging;';
SELECT 'DROP TABLE IF EXISTS staging.table_stats;';
SELECT
'CREATE TABLE staging.table_stats (
table_name varchar(255), count bigint);';
-- %1$I.%2$I is the quoted relation to count; %3$L is the same qualified name
-- as a string literal for the table_name column. Each generated statement
-- ends with a semicolon so the statements stay separate when the file runs.
SELECT format('INSERT INTO staging.table_stats(
table_name, count
)
VALUES (%3$L,
(SELECT COUNT(*)
FROM %1$I.%2$I
) );', table_schema, table_name, format('%I.%I', table_schema, table_name))
FROM information_schema.tables
WHERE table_schema = 'pg_catalog';
-- Close the output file before executing it.
\o
\i create_script.sql

View File

@@ -1,29 +1,64 @@
-- Load ACS S2502 column metadata and data files into staging tables, register
-- the column codes in census.lu_acs_types, and load values into census tables.
-- NOTE(review): several steps below appear twice in slightly different forms
-- (two \cd targets, two metadata \copy sources, two string_agg openers, two
-- ALTER TABLE ... ADD COLUMN unknown, two data \copy sources). This looks like
-- old and new revisions of the same script merged together -- confirm which
-- lines should remain before running.
\connect postgresql_book
-- Relative file names in the \copy commands resolve against the directory set here.
\cd /postgresql_up_and_running_4e_code_data/raw/ACSST1Y2024.S2502
\cd /postgresql_up_and_running_4e_code_data/raw/ACS.S2502
-- Recreate the staging table that holds the column code -> label metadata.
CREATE SCHEMA IF NOT EXISTS staging;
DROP TABLE IF EXISTS staging.lu_acs_columns;
CREATE TABLE staging.lu_acs_columns(column_name text, label text);
-- Client-side load of the metadata CSV; HEADER skips the first line.
\copy staging.lu_acs_columns FROM ACSST1Y2024.S2502-Column-Metadata.csv CSV HEADER
\copy staging.lu_acs_columns FROM ACSST5Y2023.S2502-Column-Metadata.csv CSV HEADER
-- Add only the column codes that census.lu_acs_types does not already contain.
INSERT INTO census.lu_acs_types(id, label)
SELECT column_name, label
FROM staging.lu_acs_columns
WHERE column_name NOT IN(SELECT a.id FROM census.lu_acs_types AS a);
DROP TABLE IF EXISTS staging.acs_data_raw;
-- <4>
-- Build a CREATE TABLE statement with one text column per metadata row;
-- \gexec then executes the generated statement.
SELECT 'CREATE TABLE staging.acs_data_raw('
|| string_agg( quote_ident( lower(column_name) )
|| string_agg( quote_ident(column_name )
|| ' text', ',') || ');'
FROM staging.lu_acs_columns
\gexec
-- NOTE(review): presumably the extra "unknown" column absorbs a trailing
-- field in the data file that has no metadata row -- confirm.
ALTER TABLE staging.acs_data_raw ADD COLUMN unknown text;
\gexec <5>
\copy staging.acs_data_raw FROM ACSST1Y2024.S2502-Data.csv WITH (format 'csv', HEADER)
ALTER TABLE staging.acs_data_raw ADD COLUMN unknown text; -- <6>
CREATE SCHEMA IF NOT EXISTS census;
DROP TABLE IF EXISTS census.acs_data;
-- Generates a CREATE TABLE ... AS statement that turns 'N' into NULL and casts
-- every column except GEO_ID and NAME to numeric.
-- NOTE(review): no terminator or \gexec follows this SELECT in the visible
-- text, so it would stay in the query buffer and run into the next statement.
SELECT 'CREATE TABLE census.acs_data AS
SELECT '
|| string_agg( 'NULLIF(' || quote_ident( lower(column_name) ) || ', ''N'') '
|| CASE WHEN column_name IN('GEO_ID', 'NAME') THEN '::text' ELSE '::numeric' END
|| ' AS '
|| quote_ident( lower(column_name) ) , ',' ) || '
FROM staging.acs_data_raw
WHERE geo_id <> ''Geography'';'
-- 8 to 9 secs, 85396 rows
\copy staging.acs_data_raw FROM ACSST5Y2023.S2502-Data.csv WITH (format 'csv', HEADER)
-- <7> takes 14 - 24 minutes
-- Unpivot each raw row into (column name, value) pairs via to_jsonb /
-- jsonb_each_text and keep only values made up of digits, dots, and minus
-- signs, stored as yr = 2023 facts keyed by the row's GEO_ID.
INSERT INTO census.acs_facts(acs_type_id, tract_id, yr, val)
SELECT kv.key AS acs_type_id, r."GEO_ID" AS tract_id, 2023 AS yr,
kv.value::numeric AS val
FROM staging.acs_data_raw AS r,
jsonb_each_text(to_jsonb(r)) AS kv
WHERE kv.value ~ '^[0-9\.\-]+$';
-- Reload the column metadata from the ACSST5Y2020 S2502 metadata file.
DROP TABLE IF EXISTS staging.lu_acs_columns;
CREATE TABLE staging.lu_acs_columns (
    column_name text,
    label       text
);
\copy staging.lu_acs_columns FROM ACSST5Y2020.S2502-Column-Metadata.csv CSV HEADER
-- Register any column codes that census.lu_acs_types does not list yet.
INSERT INTO census.lu_acs_types (id, label)
SELECT
    c.column_name,
    c.label
FROM staging.lu_acs_columns AS c
WHERE c.column_name NOT IN (
    SELECT t.id
    FROM census.lu_acs_types AS t
);
DROP TABLE IF EXISTS staging.acs_data_raw;
-- <4>
-- Generate the raw landing table with one text column per metadata row and
-- execute the generated statement with \gexec.
SELECT
    'CREATE TABLE staging.acs_data_raw('
    || string_agg(quote_ident(c.column_name) || ' text', ',')
    || ');'
FROM staging.lu_acs_columns AS c
\gexec <5>
-- NOTE(review): presumably the extra column absorbs a trailing field in the
-- data file that has no metadata row -- confirm.
ALTER TABLE staging.acs_data_raw ADD COLUMN unknown text; -- <6>
\copy staging.acs_data_raw FROM ACSST5Y2020.S2502-Data.csv WITH (format 'csv', HEADER)
-- <7> takes 14 - 24 minutes
-- Generate one INSERT per metadata column (all except GEO_ID and NAME) and run
-- each with \gexec, loading that column's values into census.acs_facts as
-- yr = 2020 facts keyed by the row's GEO_ID.
-- The value filter is anchored at both ends (^ ... $) so the whole value, not
-- just a prefix, must consist of digits, dots, and minus signs; a prefix-only
-- match would let text such as '250,000+' through to the ::numeric cast.
-- NOTE(review): the pattern still admits strings like '-' or '1.2.3', which
-- are not valid numerics -- confirm the data never contains them.
SELECT 'INSERT INTO census.acs_facts(acs_type_id, tract_id, yr, val)
SELECT ' || quote_literal(column_name )
|| ', r."GEO_ID" AS tract_id, 2020 AS yr, '
|| ' r.' || quote_ident(column_name ) || '::numeric AS val
FROM staging.acs_data_raw AS r
WHERE r.' || quote_ident(column_name ) || ' ~ ''^[0-9\.\-]+$'';'
FROM staging.lu_acs_columns
WHERE column_name NOT IN('GEO_ID', 'NAME')
\gexec