连接数使用超过95%告警,最大连接数和当前连接数
??select (select count(*) from pg_stat_activity) /(select setting from pg_settings where name = 'max_connections') ::numeric >0.95;
存在空闲连接idle in transaction大于5秒的连,如果长期存在这样的连接说明应用有问题,并且会造成数据库出现异常
?select count(*) from pg_stat_activity where state = 'idle in transaction' and now()-query_start>'5s'::interval);
长事务,超过30分钟的慢sql
?select count(*) from pg_stat_activity where state <> 'idle' and (backend_xid is not null or backend_xmin is not null) and now()-xact_start > interval '1800 sec'::interval;
数据库占用空间
?select pg_size_pretty(pg_database_size('postgres')); ?select datname, pg_size_pretty(pg_database_size(datname)) from pg_database;
数据库中存在长时间阻塞的锁,阻塞超过3分钟
?select ??kl.pid as 阻塞pid, ?a.usename as blocked_user, ?ka.query as 阻塞sql, ?ka.state as 阻塞sql状态, ?now()- ka.query_start as 阻塞语句执行时长,--阻塞的语句执行时长 ??bl.pid as 被阻塞pid, ?ka.usename as 被阻塞用户, ?a.query as 被阻塞sql, ?now()- a.query_start as 被阻塞语句执行时长--被阻塞语句执行时长 ?from ?pg_locks bl ?join pg_stat_activity a on ?a.pid = bl.pid ?join pg_locks kl on ?kl.transactionid = bl.transactionid ?and kl.pid != bl.pid ?join pg_stat_activity ka on ?ka.pid = kl.pid ?where ?not bl.granted ??and now()- ka.query_start >'60s'::interval;--阻塞时长超过1分钟 ? ? ??--阻塞pid对应的是数据库中pid,可以使用select pg_terminate_backend(pid);进行kill
坏元组超过10000行,且占比超过20%的表
?select current_database(), ??schemaname||'.'||relname, ??n_dead_tup, ??n_live_tup, ??round(n_dead_tup*100/ (n_live_tup )::numeric,2)||'%' AS dead_tup_ratio ??FROM ??pg_stat_all_tables ??WHERE ??n_dead_tup >= 10000 and n_live_tup>0 and (n_dead_tup)/(n_live_tup)::numeric>0.25 and schemaname not in ('pg_toast','pg_catalog','information_schema') ??ORDER BY dead_tup_ratio DESC ??LIMIT 10; ?
wal日志生成量
?--'AD/FAFFF2A8'是5分钟前select pg_current_wal_lsn()获取的值 ?select pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(),'AD/FAFFF2A8'));
表缓存命中率和索引缓存命中率,命中率低于80%说明缓存不足
注意,不常用的库缓存可能为0
?SELECT'index hit rate' AS name, ?(sum(idx_blks_hit)) / nullif(sum(idx_blks_hit + idx_blks_read),0) AS ratio ?FROM pg_statio_user_indexes ?UNION ALL ?SELECT ?'table hit rate' AS name, ?sum(heap_blks_hit) / nullif(sum(heap_blks_hit) + sum(heap_blks_read),0) AS ratio ?FROM pg_statio_user_tables;
数据库年龄监控
当age达到2亿会触发自动清理,如果大于5亿则告警
?select datname,age(datfrozenxid),pg_size_pretty(pg_database_size(oid)) from pg_database where age(datfrozenxid) > 500000000 order by age(datfrozenxid) desc limit 10 ;
表年龄(用于查看详情,默认数据库年龄会存储表年龄最大的一条)
?select sp.nspname,relname,age(relfrozenxid), pg_size_pretty(pg_table_size(cl.oid)) from pg_class cl join pg_namespace sp on cl.relnamespace = sp.oid where relkind in ('t','r') order by age(relfrozenxid) desc limit 10; ? ?-- 说明:当age到达2亿(默认)时触发自动清理,期间会大量占用系统资源。提前做好监控避免在业务高峰时发生。可在库级别操作,也可在表基本操作。
事务提交,事务回滚,全表扫描,索引扫描,插入记录,更新记录
?select sum(xact_commit),sum(xact_rollback),sum(tup_returned),sum(tup_fetched),sum(tup_inserted),sum(tup_updated),sum(tup_deleted),sum(conflicts),sum(deadlocks) from pg_stat_database; ?-- pg_stat_get_db_xact_commit 为stable函数,一个事务中两次调用之间只执行一次,所以需要外部多次执行。
回滚率计算:select xact_rollback/(xact_commit+xact_rollback),如果超过10%则代表回滚率较高
过去5分钟内生成wal个数,可以判断当前数据库的dml频繁程度
?select count(1) from pg_catalog.pg_ls_waldir() where modification > CURRENT_TIMESTAMP - '5 minutes' :: INTERVAL ;
wal写入速率
lsn是一个递增的值
?SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - '0/0' as wal_lsn;
现场使用arrs或者高可用,出现异常后并不知情,可监控复制状态,以及日志堆积大小判断当前复制情况
复制状态,true表示正常,false表示异常
??select active from pg_replication_slots;
备库applay延迟,以及堆积情况
?select ?application_name,client_addr,client_hostname,client_port,state,sync_priority,sync_state,pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(),replay_lsn)), ?pg_wal_lsn_diff(pg_current_wal_lsn(),write_lsn) write_delay, ?pg_wal_lsn_diff(pg_current_wal_lsn(),flush_lsn) flush_delay, ?pg_wal_lsn_diff(pg_current_wal_lsn(),replay_lsn) replay_dely ??from pg_stat_replication ;
arrs复制延迟,以及堆积情况
??select slot_name, plugin, slot_type, ?temporary, active, active_pid, ?pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) from ?pg_replication_slots ;
备库applay延迟,复制堆积超过1GB
?select ?application_name,client_addr,client_hostname,client_port,state,sync_priority, ?sync_state,pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_location(),replay_location)), ?pg_xlog_location_diff(pg_current_xlog_location(),write_location) write_delay, ?pg_xlog_location_diff(pg_current_xlog_location(),flush_location) flush_delay, ?pg_xlog_location_diff(pg_current_xlog_location(),replay_location) replay_dely ??from pg_stat_replication; ??select slot_name, plugin, slot_type, ??active, active_pid, ?pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn)) from ?pg_replication_slots;
主从流复制延时时间 (从库执行)
?SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END ?10 版本及以后 ?SELECT CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END; ?-- 主从复制延迟字节 (主库执行) ?select greatest(0,pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) from pg_stat_replication ; where client_addr = '从库
数据库需要安装插件pg_stat_statements
在数据库中当前频繁执行的SQL TOP5
?SELECT C.rolname, ??b.datname, ??A.total_time / A.calls per_call_time, ??A.* ?FROM ??pg_stat_statements A, ??pg_database b, ??pg_authid C ?WHERE ??A.userid = C.oid ??AND A.dbid = b.oid ?ORDER BY ??A.total_time DESC ??LIMIT 5
最耗时 SQL,单次调用最耗时 SQL TOP5
?select userid::regrole, dbid, query from pg_stat_statements order by mean_time desc limit 5;
最耗共享内存 SQL
?select userid::regrole, dbid, query from pg_stat_statements order by (shared_blks_hit+shared_blks_dirtied) desc limit 5;
总最耗IO SQL TOP5
?select userid::regrole, dbid, query from pg_stat_statements order by (blk_read_time+blk_write_time) desc limit 5;
最耗IO SQL,单次调用最耗IO SQL TOP5
?select userid::regrole, dbid, query from pg_stat_statements order by (blk_read_time+blk_write_time)/calls desc limit 5;
QPS
??with ??a as(select sum(calls)s from pg_stat_statements), ??b as(select sum(calls)s from pg_stat_statements,pg_sleep(1)) ??select b.s-a.s --QPS ??from a,b;
长期(30天)未更新统计信息的表,坏元组比例超过10%就应该触发更新统计信息(警告)
? ?SELECT ??to_char((now() - last_analyze), 'DD'), to_char((now() - last_autoanalyze),'DD' ) , ?current_database(), ??schemaname||'.'||relname,n_live_tup,n_dead_tup, ??round(n_dead_tup*100/n_live_tup::numeric,2)||'%' as dead_tup_ratio ?FROM ??pg_stat_all_tables ?WHERE ??schemaname not in( 'pg_toast','pg_catalog','information_schema' ) ?AND (last_analyze IS null OR to_char((now() - last_analyze), 'DD') :: NUMERIC > 30) ?AND (last_autovacuum IS null OR to_char((now() - last_autovacuum),'DD') :: NUMERIC > 30) ?AND (last_autoanalyze IS null OR to_char((now() - last_autoanalyze),'DD' ) :: NUMERIC > 30) ?and n_dead_tup >= 10000 and n_live_tup>0 and (n_dead_tup)/(n_live_tup)::numeric > 0.1;
未使用的索引
?SELECT ?schemaname || '.' || relname AS table, ?indexrelname AS index, ?pg_size_pretty(pg_relation_size(i.indexrelid)) AS index_size, ?idx_scan as index_scans ?FROM pg_stat_user_indexes ui ?JOIN pg_index i ON ui.indexrelid = i.indexrelid ?WHERE NOT indisunique ?AND idx_scan < 5 ?AND pg_relation_size(relid) > 5 * 8192 ?ORDER BY pg_relation_size(i.indexrelid) / nullif(idx_scan, 0) DESC NULLS FIRST, ?pg_relation_size(i.indexrelid) DESC; ? --修改后: ?SELECT ?schemaname || '.' || relname AS table, ?indexrelname AS index, ?pg_size_pretty(pg_relation_size(i.indexrelid)) AS index_size,pg_size_pretty(pg_relation_size(relid)), ?idx_scan as index_scans ?FROM pg_stat_user_indexes ui ?JOIN pg_index i ON ui.indexrelid = i.indexrelid ?WHERE NOT indisunique ?AND idx_scan < 5 ?AND pg_relation_size(relid) > 5 * 8192*1000 ?ORDER BY pg_relation_size(i.indexrelid) desc limit 5;
表未创建主键
?SELECT ?current_database (),ns.nspname,obj.relname ?FROM ?pg_class obj,pg_namespace ns ?WHERE ?obj.relnamespace = ns.oid ?AND obj.relkind = 'r' ?AND ns.nspname NOT IN ('pg_toast','pg_catalog','information_schema') ?AND obj.oid NOT IN (SELECTobj.oid FROMpg_class obj,pg_index idxWHEREobj.oid = idx.indrelidAND idx.indisprimary = 'true') ?limit 10;
数据库中存在触发器
?SELECT ?current_database (), ?ns.nspname, ?rel.relname, ?tri.tgname ?FROM ?pg_class rel, ?pg_namespace ns, ?pg_trigger tri ?WHERE ?ns.nspname NOT IN ('pg_toast','pg_catalog','information_schema') ?AND rel.oid = tri.tgrelid ?AND rel.relnamespace = ns.oid ?AND rel.relhastriggers = true and tgisinternal= = false;
上班期间存在备份进程
?select st.application_name,st.datname,st.query_start,st.client_addr,client_hostname,query from pg_stat_activity st where state='active' and application_name='pg_dump';
上班期间存在创建索引进程
?select st.application_name,st.datname,st.query_start,st.client_addr,client_hostname ?from pg_stat_activity st where state='active' and (st.query ilike 'create index%' or st.query ilike 'create uniuqe index%');
上班期间重建索引
?select st.application_name,st.datname,st.query_start,st.client_addr,client_hostname from pg_stat_activity st where state='active' and query ilike 'reindex%'; ?
vacuum full进程
?select st.application_name,st.datname,st.query_start,st.client_addr,client_hostname,query from pg_stat_activity st where state='active' and query ilike 'vacuum full%';
重复索引
?select allindx.schemaname,allindx.relname,allindx.indexrelname,ind.indexdef FROM pg_stat_all_indexes allindx,pg_indexes ind where allindx.relname=ind.tablename and allindx.indexrelname=ind.indexname and allindx.indexrelid :: VARCHAR ?IN (SELECT regexp_split_to_table(indexs, $$,$$) ?FROM(SELECT string_agg (idx :: VARCHAR, $$,$$) AS indexs ?FROM( ?SELECT indexrelid :: oid AS idx, ?(indrelid :: TEXT || $$BB$$ || indclass :: TEXT || $$BB$$ || indkey :: TEXT || $$BB$$ || COALESCE (indexprs :: TEXT, $$BB$$) || $$BB$$ || COALESCE (indpred :: TEXT, $$BB$$)) AS val FROM pg_index ) sub ?GROUP BYval HAVINGCOUNT (*) > 1 ?ORDER BY SUM (pg_relation_size(idx)) desc ?) arr) and allindx.schemaname not in ('pg_toast','pg_catalog');