Created
October 5, 2024 17:56
-
-
Save cabecada/51c86b7c2cd2a97ebb9f15d592f3cfb7 to your computer and use it in GitHub Desktop.
demo of pg block corruption and tracing the problem block
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
postgres@ubuntu:/tmp$ psql -p 5432 | |
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1)) | |
Type "help" for help. | |
postgres=# create database demo; | |
CREATE DATABASE | |
postgres=# \c demo | |
You are now connected to database "demo" as user "postgres". | |
demo=# select * from pg_database; | |
demo=# select oid,datname from pg_database; | |
oid | datname | |
-------+----------- | |
5 | postgres | |
16388 | demo | |
1 | template1 | |
4 | template0 | |
(4 rows) | |
demo=# -- 16388 demo is our db; | |
demo=# create table t(col1 int primary key, col2 text); | |
CREATE TABLE | |
demo=# insert into t select x, md5(x::text) from generate_series(1, 10000000) x; | |
INSERT 0 10000000 | |
demo=# checkpoint; | |
CHECKPOINT | |
demo=# select pg_switch_wal(); | |
pg_switch_wal | |
--------------- | |
0/642F6FD0 | |
(1 row) | |
demo=# | |
\q | |
postgres@ubuntu:/tmp$ p1 stop # just to ensure data written to disk | |
waiting for server to shut down.... done | |
server stopped | |
postgres@ubuntu:/tmp$ p1 start | |
waiting for server to start.... done | |
server started | |
postgres@ubuntu:/tmp$ psql -p 5432 demo | |
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1)) | |
Type "help" for help. | |
demo=# select pg_relation_filepath('t'); -- find the path on disk for this table | |
pg_relation_filepath | |
---------------------- | |
base/16388/16389 | |
(1 row) | |
demo=# | |
\q | |
postgres@ubuntu:/tmp$ ls db1/base/16388/16389* | |
db1/base/16388/16389 db1/base/16388/16389_fsm db1/base/16388/16389_vm | |
postgres@ubuntu:/tmp$ -- there are three file linked to this object one is datafile, one is free space map , one is visibility map | |
--: command not found | |
postgres@ubuntu:/tmp$ #we should add more data to have more files | |
postgres@ubuntu:/tmp$ psql -p 5432 demo | |
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1)) | |
Type "help" for help. | |
demo=# truncate table t; | |
TRUNCATE TABLE | |
demo=# insert into t select x, md5(x::text) from generate_series(1, 100000000) x; | |
INSERT 0 100000000 | |
demo=# select pg_relation_filepath('t'); -- find the path on disk for this table | |
pg_relation_filepath | |
---------------------- | |
base/16388/16397 | |
(1 row) | |
demo=# -- note the change of file on disk, coz we ran truncate earlier which rewrites the file on disk; | |
demo=# | |
\q | |
postgres@ubuntu:/tmp$ ls db1/base/16388/16397* | |
db1/base/16388/16397 db1/base/16388/16397.2 db1/base/16388/16397.4 db1/base/16388/16397.6 db1/base/16388/16397_vm | |
db1/base/16388/16397.1 db1/base/16388/16397.3 db1/base/16388/16397.5 db1/base/16388/16397_fsm | |
postgres@ubuntu:/tmp$ #good , so we now have 7 x 1GB files of table t on disk | |
postgres@ubuntu:/tmp$ ls -l db1/base/16388/16397* | |
-rw------- 1 postgres postgres 1073741824 Oct 5 17:30 db1/base/16388/16397 | |
-rw------- 1 postgres postgres 1073741824 Oct 5 17:31 db1/base/16388/16397.1 | |
-rw------- 1 postgres postgres 1073741824 Oct 5 17:27 db1/base/16388/16397.2 | |
-rw------- 1 postgres postgres 1073741824 Oct 5 17:27 db1/base/16388/16397.3 | |
-rw------- 1 postgres postgres 1073741824 Oct 5 17:28 db1/base/16388/16397.4 | |
-rw------- 1 postgres postgres 1073741824 Oct 5 17:29 db1/base/16388/16397.5 | |
-rw------- 1 postgres postgres 384221184 Oct 5 17:30 db1/base/16388/16397.6 | |
-rw------- 1 postgres postgres 1695744 Oct 5 17:29 db1/base/16388/16397_fsm | |
-rw------- 1 postgres postgres 40960 Oct 5 17:30 db1/base/16388/16397_vm | |
postgres@ubuntu:/tmp$ #the block size is 8k, so how many blocks in each chunk of 1gb. 1gb/8k = 131072 blocks | |
postgres@ubuntu:/tmp$ #lets write random data to block 10 of size 8k on chunk db1/base/16388/16397.1 | |
postgres@ubuntu:/tmp$ #dd if=/dev/urandom seek=10 bs=8k of=db1/base/16388/16397.1 conv=notrunc,noerror count=1 | |
postgres@ubuntu:/tmp$ p1 stop | |
waiting for server to shut down.... done | |
server stopped | |
postgres@ubuntu:/tmp$ p1 start | |
waiting for server to start.... done | |
server started | |
postgres@ubuntu:/tmp$ #just check all good till now before we corrupt block | |
postgres@ubuntu:/tmp$ psql -p 5433 demo | |
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1)) | |
Type "help" for help. | |
demo=# select * from t where col1 = -1; | |
col1 | col2 | |
------+------ | |
(0 rows) | |
demo=# set enable_indexscan to off; | |
SET | |
demo=# set enable_indexonlyscan to off; | |
SET | |
demo=# select * from t where col1 = -1; | |
col1 | col2 | |
------+------ | |
(0 rows) | |
demo=# | |
\q | |
postgres@ubuntu:/tmp$ p1 stop | |
waiting for server to shut down.... done | |
server stopped | |
postgres@ubuntu:/tmp$ dd if=/dev/urandom seek=10 bs=8k of=db1/base/16388/16397.1 conv=notrunc,noerror count=1 | |
1+0 records in | |
1+0 records out | |
8192 bytes (8.2 kB, 8.0 KiB) copied, 0.0001743 s, 47.0 MB/s | |
postgres@ubuntu:/tmp$ #now that we have corrupted the block on db1/base/16388/16397.1 we try to figure out problems in query | |
postgres@ubuntu:/tmp$ p1 start | |
waiting for server to start.... done | |
server started | |
postgres@ubuntu:/tmp$ psql -p 5432 demo | |
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1)) | |
Type "help" for help. | |
demo=# select * from t where col1 = -1; | |
col1 | col2 | |
------+------ | |
(0 rows) | |
demo=# explain analyze select * from t where col1 = -1; | |
QUERY PLAN | |
----------------------------------------------------------------------------------------------------------- | |
Index Scan using t_pkey on t (cost=0.57..8.59 rows=1 width=37) (actual time=0.025..0.025 rows=0 loops=1) | |
Index Cond: (col1 = '-1'::integer) | |
Planning Time: 0.055 ms | |
Execution Time: 0.046 ms | |
(4 rows) | |
demo=# -- coz index used and we corrupted table page, so we disable index scan for now | |
demo=# ; | |
demo=# set enable_indexscan to off; | |
SET | |
demo=# set enable_indexonlyscan to off; -- this will force seq scan ; | |
SET | |
demo=# explain analyze select * from t where col1 = -1; | |
QUERY PLAN | |
--------------------------------------------------------------------------------------------------------------- | |
Bitmap Heap Scan on t (cost=4.58..8.59 rows=1 width=37) (actual time=0.015..0.015 rows=0 loops=1) | |
Recheck Cond: (col1 = '-1'::integer) | |
-> Bitmap Index Scan on t_pkey (cost=0.00..4.58 rows=1 width=0) (actual time=0.013..0.013 rows=0 loops=1) | |
Index Cond: (col1 = '-1'::integer) | |
Planning Time: 0.053 ms | |
Execution Time: 0.048 ms | |
(6 rows) | |
demo=# set enable_bitmapscan to off; | |
SET | |
demo=# explain analyze select * from t where col1 = -1; | |
WARNING: page verification failed, calculated checksum 61673 but expected 6429 | |
ERROR: invalid page in block 131082 of relation base/16388/16397 | |
CONTEXT: parallel worker | |
demo=# -- since we had datachecksums enabled, we got the error which block on which object is having error | |
demo=# -- but remember 1 chunk of 1gb file only has 131072 blocks, but the error is on block 131082 which is 10 blocks past chunk 1. so it means on chunk 2 i.e db1/base/16388/16397.1 block 10 which is what we expect the problem to be at | |
demo=# -- lets query replica and see what we can do | |
demo=# | |
\q | |
postgres@ubuntu:/tmp$ psql -p 5433 demo | |
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1)) | |
Type "help" for help. | |
demo=# set enable_indexscan to off; | |
SET | |
demo=# set enable_indexonlyscan to off; | |
SET | |
demo=# set enable_bitmapscan to off; | |
SET | |
demo=# explain analyze select * from t where col1 = -1; QUERY PLAN | |
----------------------------------------------------------------------------------------------------------------------- | |
Gather (cost=1000.00..1355167.85 rows=1 width=37) (actual time=6078.972..6083.270 rows=0 loops=1) | |
Workers Planned: 2 | |
Workers Launched: 2 | |
-> Parallel Seq Scan on t (cost=0.00..1354167.75 rows=1 width=37) (actual time=6065.403..6065.403 rows=0 loops=3) | |
Filter: (col1 = '-1'::integer) | |
Rows Removed by Filter: 33333333 | |
Planning Time: 0.554 ms | |
JIT: | |
Functions: 6 | |
Options: Inlining true, Optimization true, Expressions true, Deforming true | |
Timing: Generation 0.329 ms, Inlining 136.889 ms, Optimization 7.492 ms, Emission 10.393 ms, Total 155.102 ms | |
Execution Time: 6094.496 ms | |
(12 rows) | |
demo=# -- great no errors since we only corrupted the block of primary and not replica. | |
demo=# -- so we should failover and rebuild the replica, but to tinker around, i will copy data from replica and rebuild indexes on primary for fun | |
demo=# select pg_relation_filepath('t'); -- find the path on disk for this table pg_relation_filepath | |
---------------------- | |
base/16388/16397 | |
(1 row) | |
demo=# | |
\q | |
postgres@ubuntu:/tmp$ p1 stop | |
waiting for server to shut down.... done | |
server stopped | |
postgres@ubuntu:/tmp$ p2 stop | |
waiting for server to shut down.... done | |
server stopped | |
postgres@ubuntu:/tmp$ #note the checksums of only chunk 2 differs | |
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397 | |
3599534708 1073741824 db1/base/16388/16397 | |
3599534708 1073741824 db2/base/16388/16397 | |
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397.1 | |
3722867617 1073741824 db1/base/16388/16397.1 | |
3638819912 1073741824 db2/base/16388/16397.1 | |
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397.2 | |
2189200680 1073741824 db1/base/16388/16397.2 | |
2189200680 1073741824 db2/base/16388/16397.2 | |
postgres@ubuntu:/tmp$ #so for fun i can either copy the whole 1gb file, but i will only copy the problem block | |
postgres@ubuntu:/tmp$ dd if=db2/base/16388/16397.1 seek=10 skip=10 bs=8k of=db1/base/16388/16397.1 conv=notrunc,noerror count=1 | |
1+0 records in | |
1+0 records out | |
8192 bytes (8.2 kB, 8.0 KiB) copied, 5.12e-05 s, 160 MB/s | |
postgres@ubuntu:/tmp$ cksum db*/base/16388/16397.1 | |
3638819912 1073741824 db1/base/16388/16397.1 | |
3638819912 1073741824 db2/base/16388/16397.1 | |
postgres@ubuntu:/tmp$ #note the checksums match and original checksum of db2 chunk 2 untampered | |
postgres@ubuntu:/tmp$ p1 start | |
waiting for server to start.... done | |
server started | |
postgres@ubuntu:/tmp$ p2 start | |
waiting for server to start.... done | |
server started | |
postgres@ubuntu:/tmp$ psql -p 5432 demo | |
psql (16.4 (Ubuntu 16.4-0ubuntu0.24.04.1)) | |
Type "help" for help. | |
demo=# set enable_bitmapscan to off; | |
SET | |
demo=# set enable_indexonlyscan to off; | |
SET | |
demo=# set enable_indexscan to off; | |
SET | |
demo=# -- lets try again; | |
demo=# explain analyze select * from t where col1 = -1; | |
QUERY PLAN | |
----------------------------------------------------------------------------------------------------------------------- | |
Gather (cost=1000.00..1355167.85 rows=1 width=37) (actual time=5173.036..5176.222 rows=0 loops=1) | |
Workers Planned: 2 | |
Workers Launched: 2 | |
-> Parallel Seq Scan on t (cost=0.00..1354167.75 rows=1 width=37) (actual time=5161.846..5161.846 rows=0 loops=3) | |
Filter: (col1 = '-1'::integer) | |
Rows Removed by Filter: 33333333 | |
Planning Time: 3.577 ms | |
JIT: | |
Functions: 6 | |
Options: Inlining true, Optimization true, Expressions true, Deforming true | |
Timing: Generation 0.546 ms, Inlining 138.306 ms, Optimization 7.388 ms, Emission 9.805 ms, Total 156.045 ms | |
Execution Time: 5193.682 ms | |
(12 rows) | |
demo=# -- coool we recovered the problem block by copying from replica | |
demo=# | |
\q | |
postgres@ubuntu:/tmp$ p1 stop | |
waiting for server to shut down.... done | |
server stopped | |
postgres@ubuntu:/tmp$ p2 stop | |
waiting for server to shut down.... done | |
server stopped |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
to check the contents of a file without the db, we can make use of pg_filedump and see what data we can recover, and what we cannot.
here we try to get the contents of block 10 on chunk2. with and without corruption