PostgreSQL 的流复制统计信息中记录了4个WAL日志的位置信息(sent、write、flush、replay)。其中write、flush、replay这3个位置是standby的wal receiver进程反馈给primary的wal sender进程的,sent位置则是wal sender进程自己记录的已发送位置。
统计视图如下:
pipeline=# \d+ pg_stat_replication
                      View "pg_catalog.pg_stat_replication"
      Column      |           Type           | Modifiers | Storage  | Description
------------------+--------------------------+-----------+----------+-------------
 pid              | integer                  |           | plain    |
 usesysid         | oid                      |           | plain    |
 usename          | name                     |           | plain    |
 application_name | text                     |           | extended |
 client_addr      | inet                     |           | main     |
 client_hostname  | text                     |           | extended |
 client_port      | integer                  |           | plain    |
 backend_start    | timestamp with time zone |           | plain    |
 backend_xmin     | xid                      |           | plain    |
 state            | text                     |           | extended |
 sent_location    | pg_lsn                   |           | plain    |
 write_location   | pg_lsn                   |           | plain    |
 flush_location   | pg_lsn                   |           | plain    |
 replay_location  | pg_lsn                   |           | plain    |
 sync_priority    | integer                  |           | plain    |
 sync_state       | text                     |           | extended |
View definition:
 SELECT s.pid,
    s.usesysid,
    u.rolname AS usename,
    s.application_name,
    s.client_addr,
    s.client_hostname,
    s.client_port,
    s.backend_start,
    s.backend_xmin,
    w.state,
    w.sent_location,
    w.write_location,
    w.flush_location,
    w.replay_location,
    w.sync_priority,
    w.sync_state
   FROM pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, waiting, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin),
    pg_authid u,
    pg_stat_get_wal_senders() w(pid, state, sent_location, write_location, flush_location, replay_location, sync_priority, sync_state)
  WHERE s.usesysid = u.oid AND s.pid = w.pid;
实际上这几个位置信息是从函数pg_stat_get_wal_senders获取到的,
这个函数的信息如下
pipeline=# \df+ pg_stat_get_wal_senders
List of functions
-[ RECORD 1 ]-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Schema              | pg_catalog
Name                | pg_stat_get_wal_senders
Result data type    | SETOF record
Argument data types | OUT pid integer, OUT state text, OUT sent_location pg_lsn, OUT write_location pg_lsn, OUT flush_location pg_lsn, OUT replay_location pg_lsn, OUT sync_priority integer, OUT sync_state text
Type                | normal
Security            | invoker
Volatility          | stable
Owner               | postgres
Language            | internal
Source code         | pg_stat_get_wal_senders
Description         | statistics: information about currently active replication
pg_stat_get_wal_senders对应的源码如下
src/backend/replication/walsender.c
/** Returns activity of walsenders, including pids and xlog locations sent to* standby servers.*/Datumpg_stat_get_wal_senders(PG_FUNCTION_ARGS){.../* use volatile pointer to prevent code rearrangement */volatile WalSnd *walsnd = MyWalSnd;...state = walsnd->state; // 4个位置信息的值是这样得来的write = walsnd->write;flush = walsnd->flush;apply = walsnd->apply;...
walsnd的定义如下
src/include/replication/walsender_private.h
/** Each walsender has a WalSnd struct in shared memory.*/typedef struct WalSnd{pid_t pid; /* this walsender's process id, or 0 */WalSndState state; /* this walsender's state */XLogRecPtr sentPtr; /* WAL has been sent up to this point */bool needreload; /* does currently-open file need to be* reloaded? *//** The xlog locations that have been written, flushed, and applied by* standby-side. These may be invalid if the standby-side has not offered* values yet.*/XLogRecPtr write;XLogRecPtr flush;XLogRecPtr apply;/* Protects shared variables shown above. */slock_t mutex;/** Pointer to the walsender's latch. Used by backends to wake up this* walsender when it has work to do. NULL if the walsender isn't active.*/Latch *latch;/** The priority order of the standby managed by this WALSender, as listed* in synchronous_standby_names, or 0 if not-listed. Protected by* SyncRepLock.*/int sync_standby_priority;} WalSnd;
src/backend/replication/walsender.c
/** Regular reply from standby advising of WAL positions on standby server.*/static voidProcessStandbyReplyMessage(void){......XLogRecPtr writePtr,flushPtr,applyPtr;bool replyRequested;/* the caller already consumed the msgtype byte */writePtr = pq_getmsgint64(&reply_message); // 接收来自walreceiver的位置信息flushPtr = pq_getmsgint64(&reply_message);applyPtr = pq_getmsgint64(&reply_message);(void) pq_getmsgint64(&reply_message); /* sendTime; not used ATM */replyRequested = pq_getmsgbyte(&reply_message);elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s",(uint32) (writePtr >> 32), (uint32) writePtr,(uint32) (flushPtr >> 32), (uint32) flushPtr,(uint32) (applyPtr >> 32), (uint32) applyPtr,replyRequested ? " (reply requested)" : "");....../** Update shared state for this WalSender process based on reply data from* standby.*/{/* use volatile pointer to prevent code rearrangement */volatile WalSnd *walsnd = MyWalSnd;SpinLockAcquire(&walsnd->mutex);walsnd->write = writePtr; // 这几个数据实际上是walsender进程从walreceiver进程接收到的walsnd->flush = flushPtr;walsnd->apply = applyPtr;SpinLockRelease(&walsnd->mutex);}....
接下来看walreceiver端的位置信息是如何计算的。首先是保存write、flush位置的数据结构:
src/backend/replication/walreceiver.c
/*
 * LogstreamResult records, as byte positions, how far the received WAL has
 * already been written and fsynced.
 */
static struct
{
	XLogRecPtr	Write;			/* last byte + 1 written out in the standby */
	XLogRecPtr	Flush;			/* last byte + 1 flushed in the standby */
}			LogstreamResult;
// 发送位置信息给walsender
....../** Send reply message to primary, indicating our current XLOG positions, oldest* xmin and the current time.** If 'force' is not set, the message is only sent if enough time has* passed since last status update to reach wal_receiver_status_interval.* If wal_receiver_status_interval is disabled altogether and 'force' is* false, this is a no-op.** If 'requestReply' is true, requests the server to reply immediately upon* receiving this message. This is used for heartbearts, when approaching* wal_receiver_timeout.*/static voidXLogWalRcvSendReply(bool force, bool requestReply){...../* Construct a new message */writePtr = LogstreamResult.Write;flushPtr = LogstreamResult.Flush;applyPtr = GetXLogReplayRecPtr(NULL);resetStringInfo(&reply_message);pq_sendbyte(&reply_message, 'r');pq_sendint64(&reply_message, writePtr);pq_sendint64(&reply_message, flushPtr);pq_sendint64(&reply_message, applyPtr);pq_sendint64(&reply_message, GetCurrentIntegerTimestamp());pq_sendbyte(&reply_message, requestReply ? 1 : 0);......
walreceiver在写WAL时会调用write接口和fsync接口,并相应地更新LogstreamResult.Write和LogstreamResult.Flush:
/** Write XLOG data to disk.*/static voidXLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr){......int startoff;int byteswritten;while (nbytes > 0){int segbytes;if (recvFile < 0 || !XLByteInSeg(recptr, recvSegNo)){bool use_existent;/** fsync() and close current file before we switch to next one. We* would otherwise have to reopen this file to fsync it later*/if (recvFile >= 0){char xlogfname[MAXFNAMELEN];XLogWalRcvFlush(false); // 调用fsync数据,指刷到磁盘,并更新flush位置为老的write位置....../* OK to write the logs */errno = 0;byteswritten = write(recvFile, buf, segbytes); // 调用write接口,指刷到os dirty page cacheif (byteswritten <= 0){/* if write didn't set errno, assume no disk space */if (errno == 0)errno = ENOSPC;ereport(PANIC,(errcode_for_file_access(),errmsg("could not write to log segment %s ""at offset %u, length %lu: %m",XLogFileNameP(recvFileTLI, recvSegNo),recvOff, (unsigned long) segbytes)));}/* Update state for write */recptr += byteswritten; // 修正最新write位置recvOff += byteswritten;nbytes -= byteswritten;buf += byteswritten;LogstreamResult.Write = recptr; // 更新Write位置...
/** Flush the log to disk.** If we're in the midst of dying, it's unwise to do anything that might throw* an error, so we skip sending a reply in that case.*/static voidXLogWalRcvFlush(bool dying){if (LogstreamResult.Flush < LogstreamResult.Write){/* use volatile pointer to prevent code rearrangement */volatile WalRcvData *walrcv = WalRcv;issue_xlog_fsync(recvFile, recvSegNo);LogstreamResult.Flush = LogstreamResult.Write; -- 将Flush改为老的Write值......
fsync wal接口调用
src/backend/access/transam/xlog.c
/*
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
 *
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
 * 'segno' is the segment number of that file; it is used only to build the
 * file name shown in error reports.
 *
 * The call that is issued is chosen by sync_method.  For the "open" methods
 * nothing is done here, because the write itself already synced the data.
 * A failing sync call, or an unrecognized sync_method, is reported at PANIC
 * level.
 */
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
	switch (sync_method)
	{
		case SYNC_METHOD_FSYNC:
			if (pg_fsync_no_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync log file %s: %m",
								XLogFileNameP(ThisTimeLineID, segno))));
			break;
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
			if (pg_fsync_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync write-through log file %s: %m",
								XLogFileNameP(ThisTimeLineID, segno))));
			break;
#endif
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fdatasync log file %s: %m",
								XLogFileNameP(ThisTimeLineID, segno))));
			break;
#endif
		case SYNC_METHOD_OPEN:
		case SYNC_METHOD_OPEN_DSYNC:
			/* write synced it already */
			break;
		default:
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
			break;
	}
}
以上用到的pg_fsync_no_writethrough、pg_fsync_writethrough、pg_fdatasync等函数的实现详见src/backend/storage/file/fd.c
现在可以小结一下了
sent_location primary的wal sender进程已发送给standby的WAL最新位置(即WalSnd.sentPtr,由wal sender自己记录,而不是由standby反馈)
write_location standby已接收到,并已调用write接口写入OS page cache(dirty page,此时尚未保证落盘)的WAL最新位置
flush_location standby已接收到,并已通过wal_sync_method配置的同步接口(fsync、fdatasync等)刷到磁盘的WAL最新位置
replay_location standby已接收到,并已apply进行恢复的WAL最新位置