Skip to content

Instantly share code, notes, and snippets.

@cabecada
Created October 5, 2024 17:56
Show Gist options
  • Save cabecada/51c86b7c2cd2a97ebb9f15d592f3cfb7 to your computer and use it in GitHub Desktop.
Save cabecada/51c86b7c2cd2a97ebb9f15d592f3cfb7 to your computer and use it in GitHub Desktop.
demo of pg block corruption and tracing the problem block
postgres@ubuntu:/tmp$ psql -p 5432
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1))
Type "help" for help.
postgres=# create database demo;
CREATE DATABASE
postgres=# \c demo
You are now connected to database "demo" as user "postgres".
demo=# select * from pg_database;
demo=# select oid,datname from pg_database;
oid | datname
-------+-----------
5 | postgres
16388 | demo
1 | template1
4 | template0
(4 rows)
demo=# -- 16388 demo is our db;
demo=# create table t(col1 int primary key, col2 text);
CREATE TABLE
demo=# insert into t select x, md5(x::text) from generate_series(1, 10000000) x;
INSERT 0 10000000
demo=# checkpoint;
CHECKPOINT
demo=# select pg_switch_wal();
pg_switch_wal
---------------
0/642F6FD0
(1 row)
demo=#
\q
postgres@ubuntu:/tmp$ p1 stop # just to ensure data written to disk
waiting for server to shut down.... done
server stopped
postgres@ubuntu:/tmp$ p1 start
waiting for server to start.... done
server started
postgres@ubuntu:/tmp$ psql -p 5432 demo
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1))
Type "help" for help.
demo=# select pg_relation_filepath('t'); -- find the path on disk for this table
pg_relation_filepath
----------------------
base/16388/16389
(1 row)
demo=#
\q
postgres@ubuntu:/tmp$ ls db1/base/16388/16389*
db1/base/16388/16389 db1/base/16388/16389_fsm db1/base/16388/16389_vm
postgres@ubuntu:/tmp$ -- there are three file linked to this object one is datafile, one is free space map , one is visibility map
--: command not found
postgres@ubuntu:/tmp$ #we should add more data to have more files
postgres@ubuntu:/tmp$ psql -p 5432 demo
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1))
Type "help" for help.
demo=# truncate table t;
TRUNCATE TABLE
demo=# insert into t select x, md5(x::text) from generate_series(1, 100000000) x;
INSERT 0 100000000
demo=# select pg_relation_filepath('t'); -- find the path on disk for this table
pg_relation_filepath
----------------------
base/16388/16397
(1 row)
demo=# -- note the change of file on disk, coz we ran truncate earlier which rewrites the file on disk;
demo=#
\q
postgres@ubuntu:/tmp$ ls db1/base/16388/16397*
db1/base/16388/16397 db1/base/16388/16397.2 db1/base/16388/16397.4 db1/base/16388/16397.6 db1/base/16388/16397_vm
db1/base/16388/16397.1 db1/base/16388/16397.3 db1/base/16388/16397.5 db1/base/16388/16397_fsm
postgres@ubuntu:/tmp$ #good , so we now have 7 x 1GB files of table t on disk
postgres@ubuntu:/tmp$ ls -l db1/base/16388/16397*
-rw------- 1 postgres postgres 1073741824 Oct 5 17:30 db1/base/16388/16397
-rw------- 1 postgres postgres 1073741824 Oct 5 17:31 db1/base/16388/16397.1
-rw------- 1 postgres postgres 1073741824 Oct 5 17:27 db1/base/16388/16397.2
-rw------- 1 postgres postgres 1073741824 Oct 5 17:27 db1/base/16388/16397.3
-rw------- 1 postgres postgres 1073741824 Oct 5 17:28 db1/base/16388/16397.4
-rw------- 1 postgres postgres 1073741824 Oct 5 17:29 db1/base/16388/16397.5
-rw------- 1 postgres postgres 384221184 Oct 5 17:30 db1/base/16388/16397.6
-rw------- 1 postgres postgres 1695744 Oct 5 17:29 db1/base/16388/16397_fsm
-rw------- 1 postgres postgres 40960 Oct 5 17:30 db1/base/16388/16397_vm
postgres@ubuntu:/tmp$ #the block size is 8k, so how many blocks in each chunk of 1gb. 1gb/8k = 131072 blocks
postgres@ubuntu:/tmp$ #lets write random data to block 10 of size 8k on chunk db1/base/16388/16397.1
postgres@ubuntu:/tmp$ #dd if=/dev/urandom seek=10 bs=8k of=db1/base/16388/16397.1 conv=notrunc,noerror count=1
postgres@ubuntu:/tmp$ p1 stop
waiting for server to shut down.... done
server stopped
postgres@ubuntu:/tmp$ p1 start
waiting for server to start.... done
server started
postgres@ubuntu:/tmp$ #just check all good till now before we corrupt block
postgres@ubuntu:/tmp$ psql -p 5433 demo
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1))
Type "help" for help.
demo=# select * from t where col1 = -1;
col1 | col2
------+------
(0 rows)
demo=# set enable_indexscan to off;
SET
demo=# set enable_indexonlyscan to off;
SET
demo=# select * from t where col1 = -1;
col1 | col2
------+------
(0 rows)
demo=#
\q
postgres@ubuntu:/tmp$ p1 stop
waiting for server to shut down.... done
server stopped
postgres@ubuntu:/tmp$ dd if=/dev/urandom seek=10 bs=8k of=db1/base/16388/16397.1 conv=notrunc,noerror count=1
1+0 records in
1+0 records out
8192 bytes (8.2 kB, 8.0 KiB) copied, 0.0001743 s, 47.0 MB/s
postgres@ubuntu:/tmp$ #now that we have corrupted the block on db1/base/16388/16397.1 we try to figure out problems in query
postgres@ubuntu:/tmp$ p1 start
waiting for server to start.... done
server started
postgres@ubuntu:/tmp$ psql -p 5432 demo
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1))
Type "help" for help.
demo=# select * from t where col1 = -1;
col1 | col2
------+------
(0 rows)
demo=# explain analyze select * from t where col1 = -1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------
Index Scan using t_pkey on t (cost=0.57..8.59 rows=1 width=37) (actual time=0.025..0.025 rows=0 loops=1)
Index Cond: (col1 = '-1'::integer)
Planning Time: 0.055 ms
Execution Time: 0.046 ms
(4 rows)
demo=# -- coz index used and we corrupted table page, so we disable index scan for now
demo=# ;
demo=# set enable_indexscan to off;
SET
demo=# set enable_indexonlyscan to off; -- this will force seq scan ;
SET
demo=# explain analyze select * from t where col1 = -1;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on t (cost=4.58..8.59 rows=1 width=37) (actual time=0.015..0.015 rows=0 loops=1)
Recheck Cond: (col1 = '-1'::integer)
-> Bitmap Index Scan on t_pkey (cost=0.00..4.58 rows=1 width=0) (actual time=0.013..0.013 rows=0 loops=1)
Index Cond: (col1 = '-1'::integer)
Planning Time: 0.053 ms
Execution Time: 0.048 ms
(6 rows)
demo=# set enable_bitmapscan to off;
SET
demo=# explain analyze select * from t where col1 = -1;
WARNING: page verification failed, calculated checksum 61673 but expected 6429
ERROR: invalid page in block 131082 of relation base/16388/16397
CONTEXT: parallel worker
demo=# -- since we had datachecksums enabled, we got the error which block on which object is having error
demo=# -- but remember 1 chunk of 1gb file only has 131072 blocks, but the error is on block 131082 which is 10 blocks past chunk 1. so it means on chunk 2 i.e db1/base/16388/16397.1 block 10 which is what we expect the problem to be at
demo=# -- lets query replica and see what we can do
demo=#
\q
postgres@ubuntu:/tmp$ psql -p 5433 demo
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1))
Type "help" for help.
demo=# set enable_indexscan to off;
SET
demo=# set enable_indexonlyscan to off;
SET
demo=# set enable_bitmapscan to off;
SET
demo=# explain analyze select * from t where col1 = -1; QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..1355167.85 rows=1 width=37) (actual time=6078.972..6083.270 rows=0 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Parallel Seq Scan on t (cost=0.00..1354167.75 rows=1 width=37) (actual time=6065.403..6065.403 rows=0 loops=3)
Filter: (col1 = '-1'::integer)
Rows Removed by Filter: 33333333
Planning Time: 0.554 ms
JIT:
Functions: 6
Options: Inlining true, Optimization true, Expressions true, Deforming true
Timing: Generation 0.329 ms, Inlining 136.889 ms, Optimization 7.492 ms, Emission 10.393 ms, Total 155.102 ms
Execution Time: 6094.496 ms
(12 rows)
demo=# -- great no errors since we only corrupted the block of primary and not replica.
demo=# -- so we should failover and rebuild the replica, but to tinker around, i will copy data from replica and rebuild indexes on primary for fun
demo=# select pg_relation_filepath('t'); -- find the path on disk for this table pg_relation_filepath
----------------------
base/16388/16397
(1 row)
demo=#
\q
postgres@ubuntu:/tmp$ p1 stop
waiting for server to shut down.... done
server stopped
postgres@ubuntu:/tmp$ p2 stop
waiting for server to shut down.... done
server stopped
postgres@ubuntu:/tmp$ #note the checksums of only chunk 2 differs
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397
3599534708 1073741824 db1/base/16388/16397
3599534708 1073741824 db2/base/16388/16397
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397.1
3722867617 1073741824 db1/base/16388/16397.1
3638819912 1073741824 db2/base/16388/16397.1
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397.2
2189200680 1073741824 db1/base/16388/16397.2
2189200680 1073741824 db2/base/16388/16397.2
postgres@ubuntu:/tmp$ #so for fun i can either copy the whole 1gb file, but i will only copy the problem block
postgres@ubuntu:/tmp$ dd if=db2/base/16388/16397.1 seek=10 skip=10 bs=8k of=db1/base/16388/16397.1 conv=notrunc,noerror count=1
1+0 records in
1+0 records out
8192 bytes (8.2 kB, 8.0 KiB) copied, 5.12e-05 s, 160 MB/s
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397.1
3638819912 1073741824 db1/base/16388/16397.1
3638819912 1073741824 db2/base/16388/16397.1
postgres@ubuntu:/tmp$ #note the checksums match and original checksum of db2 chunk 2 untampered
postgres@ubuntu:/tmp$ p1 start
waiting for server to start.... done
server started
postgres@ubuntu:/tmp$ p2 start
waiting for server to start.... done
server started
postgres@ubuntu:/tmp$ psql -p 5432 demo
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1))
Type "help" for help.
demo=# set enable_bitmapscan to off;
SET
demo=# set enable_indexonlyscan to off;
SET
demo=# set enable_indexscan to off;
SET
demo=# -- lets try again;
demo=# explain analyze select * from t where col1 = -1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..1355167.85 rows=1 width=37) (actual time=5173.036..5176.222 rows=0 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Parallel Seq Scan on t (cost=0.00..1354167.75 rows=1 width=37) (actual time=5161.846..5161.846 rows=0 loops=3)
Filter: (col1 = '-1'::integer)
Rows Removed by Filter: 33333333
Planning Time: 3.577 ms
JIT:
Functions: 6
Options: Inlining true, Optimization true, Expressions true, Deforming true
Timing: Generation 0.546 ms, Inlining 138.306 ms, Optimization 7.388 ms, Emission 9.805 ms, Total 156.045 ms
Execution Time: 5193.682 ms
(12 rows)
demo=# -- coool we recovered the problem block by copying from replica
demo=#
\q
postgres@ubuntu:/tmp$ p1 stop
waiting for server to shut down.... done
server stopped
postgres@ubuntu:/tmp$ p2 stop
waiting for server to shut down.... done
server stopped
@cabecada
Copy link
Author

cabecada commented Oct 5, 2024

to check the contents of a file without the db, we can make use of pg_filedump and see what data we can recover, and what we cannot.
here we try to get the contents of block 10 on chunk2. with and without corruption

postgres@ubuntu:/tmp$ pg_filedump -R 10 10 -D int,text db1/base/16388/16397.1 | grep COPY
COPY: 15729841  955c4ed410c40e3ee8debe2c003aa126
COPY: 15729842  56074a530bf0b1ef2d8eb2745877f3d3
COPY: 15729843  c0bbb62510ed4d03d24d98ca310709cc
COPY: 15729844  5dd6505dfb3f22b407465039bf60d944
COPY: 15729845  c1cf9d4c9932cbd53982f209249a2ffc
COPY: 15729846  81fc0cd00599d5f11e1d69d7aa0aa38c
COPY: 15729847  403965e7be49aecc3850dc3021d09d72
COPY: 15729848  6c1810bcb97d4d2bef17c99a987149dc
COPY: 15729849  fcc6f10eea12ece09205c0baabb3fcd3
COPY: 15729850  80bb4f64fcb6fa9e036f809fb00bfe6d
COPY: 15729851  36db60a769cfd14e58b31b8c4baeaad0
COPY: 15729852  f96b2d8d739ea2e784c65178a9db0937
COPY: 15729853  83eccae6ba8096697e7bfa7866402fdd
COPY: 15729960  39dc538eeb99bdec492dfad62423b51a
...
postgres@ubuntu:/tmp$ pg_filedump -R 10 10 -D int,text db1/base/16388/16397.1 | grep COPY | head -5
COPY: 15729841  955c4ed410c40e3ee8debe2c003aa126
COPY: 15729842  56074a530bf0b1ef2d8eb2745877f3d3
COPY: 15729843  c0bbb62510ed4d03d24d98ca310709cc
COPY: 15729844  5dd6505dfb3f22b407465039bf60d944
COPY: 15729845  c1cf9d4c9932cbd53982f209249a2ffc
postgres@ubuntu:/tmp$ pg_filedump -R 10 10 -D int,text db2/base/16388/16397.1 | grep COPY | head -5
COPY: 15729841  955c4ed410c40e3ee8debe2c003aa126
COPY: 15729842  56074a530bf0b1ef2d8eb2745877f3d3
COPY: 15729843  c0bbb62510ed4d03d24d98ca310709cc
COPY: 15729844  5dd6505dfb3f22b407465039bf60d944
COPY: 15729845  c1cf9d4c9932cbd53982f209249a2ffc
postgres@ubuntu:/tmp$ #now we corrupt using dd
postgres@ubuntu:/tmp$ dd if=/dev/urandom seek=10 bs=8k of=db1/base/16388/16397.1 conv=notrunc,noerror count=1
1+0 records in
1+0 records out
8192 bytes (8.2 kB, 8.0 KiB) copied, 9.7901e-05 s, 83.7 MB/s
postgres@ubuntu:/tmp$ pg_filedump -R 10 10 -D int,text db1/base/16388/16397.1 | grep COPY | head -5
postgres@ubuntu:/tmp$ #filedump could not fetch data from file on disk on db1, but db2 is ok since it is not corrupted
postgres@ubuntu:/tmp$ pg_filedump -R 10 10 -D int,text db2/base/16388/16397.1 | grep COPY | head -5
COPY: 15729841  955c4ed410c40e3ee8debe2c003aa126
COPY: 15729842  56074a530bf0b1ef2d8eb2745877f3d3
COPY: 15729843  c0bbb62510ed4d03d24d98ca310709cc
COPY: 15729844  5dd6505dfb3f22b407465039bf60d944
COPY: 15729845  c1cf9d4c9932cbd53982f209249a2ffc

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment