
PostgreSQL pg_stat_replication: the differences among sent_location, write_location, flush_location and replay_location

PostgreSQL's streaming replication statistics record four WAL location values, which the standby's WAL receiver process reports back to the primary's WAL sender process.
The statistics view:

pipeline=# \d+ pg_stat_replication
                      View "pg_catalog.pg_stat_replication"
      Column      |           Type           | Modifiers | Storage  | Description 
------------------+--------------------------+-----------+----------+-------------
 pid              | integer                  |           | plain    | 
 usesysid         | oid                      |           | plain    | 
 usename          | name                     |           | plain    | 
 application_name | text                     |           | extended | 
 client_addr      | inet                     |           | main     | 
 client_hostname  | text                     |           | extended | 
 client_port      | integer                  |           | plain    | 
 backend_start    | timestamp with time zone |           | plain    | 
 backend_xmin     | xid                      |           | plain    | 
 state            | text                     |           | extended | 
 sent_location    | pg_lsn                   |           | plain    | 
 write_location   | pg_lsn                   |           | plain    | 
 flush_location   | pg_lsn                   |           | plain    | 
 replay_location  | pg_lsn                   |           | plain    | 
 sync_priority    | integer                  |           | plain    | 
 sync_state       | text                     |           | extended | 
View definition:
 SELECT s.pid,
    s.usesysid,
    u.rolname AS usename,
    s.application_name,
    s.client_addr,
    s.client_hostname,
    s.client_port,
    s.backend_start,
    s.backend_xmin,
    w.state,
    w.sent_location,
    w.write_location,
    w.flush_location,
    w.replay_location,
    w.sync_priority,
    w.sync_state
   FROM pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, waiting, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin),
    pg_authid u,
    pg_stat_get_wal_senders() w(pid, state, sent_location, write_location, flush_location, replay_location, sync_priority, sync_state)
  WHERE s.usesysid = u.oid AND s.pid = w.pid;

These location values are in fact obtained from the function pg_stat_get_wal_senders, whose metadata is shown below:

pipeline=# \df+ pg_stat_get_wal_senders
List of functions
-[ RECORD 1 ]-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Schema              | pg_catalog
Name                | pg_stat_get_wal_senders
Result data type    | SETOF record
Argument data types | OUT pid integer, OUT state text, OUT sent_location pg_lsn, OUT write_location pg_lsn, OUT flush_location pg_lsn, OUT replay_location pg_lsn, OUT sync_priority integer, OUT sync_state text
Type                | normal
Security            | invoker
Volatility          | stable
Owner               | postgres
Language            | internal
Source code         | pg_stat_get_wal_senders
Description         | statistics: information about currently active replication


The source of pg_stat_get_wal_senders lives in
src/backend/replication/walsender.c

/*
 * Returns activity of walsenders, including pids and xlog locations sent to
 * standby servers.
 */
Datum
pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
{
...
                        /* use volatile pointer to prevent code rearrangement */
                        volatile WalSnd *walsnd = MyWalSnd;
...
                state = walsnd->state;  // the four location values are read from shared memory here
                write = walsnd->write;
                flush = walsnd->flush;
                apply = walsnd->apply;
...

WalSnd is defined in
src/include/replication/walsender_private.h

/*
 * Each walsender has a WalSnd struct in shared memory.
 */
typedef struct WalSnd
{
        pid_t           pid;                    /* this walsender's process id, or 0 */
        WalSndState state;                      /* this walsender's state */
        XLogRecPtr      sentPtr;                /* WAL has been sent up to this point */
        bool            needreload;             /* does currently-open file need to be
                                                                 * reloaded? */

        /*
         * The xlog locations that have been written, flushed, and applied by
         * standby-side. These may be invalid if the standby-side has not offered
         * values yet.
         */
        XLogRecPtr      write;
        XLogRecPtr      flush;
        XLogRecPtr      apply;

        /* Protects shared variables shown above. */
        slock_t         mutex;

        /*
         * Pointer to the walsender's latch. Used by backends to wake up this
         * walsender when it has work to do. NULL if the walsender isn't active.
         */
        Latch      *latch;

        /*
         * The priority order of the standby managed by this WALSender, as listed
         * in synchronous_standby_names, or 0 if not-listed. Protected by
         * SyncRepLock.
         */
        int                     sync_standby_priority;
} WalSnd;


src/backend/replication/walsender.c

/*
 * Regular reply from standby advising of WAL positions on standby server.
 */
static void
ProcessStandbyReplyMessage(void)
{
......
        XLogRecPtr      writePtr,
                                flushPtr,
                                applyPtr;
        bool            replyRequested;

        /* the caller already consumed the msgtype byte */
        writePtr = pq_getmsgint64(&reply_message); // locations received from the walreceiver
        flushPtr = pq_getmsgint64(&reply_message);
        applyPtr = pq_getmsgint64(&reply_message);
        (void) pq_getmsgint64(&reply_message);          /* sendTime; not used ATM */
        replyRequested = pq_getmsgbyte(&reply_message);

        elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s",
                 (uint32) (writePtr >> 32), (uint32) writePtr, 
                 (uint32) (flushPtr >> 32), (uint32) flushPtr,
                 (uint32) (applyPtr >> 32), (uint32) applyPtr,
                 replyRequested ? " (reply requested)" : "");
......
        /*
         * Update shared state for this WalSender process based on reply data from
         * standby.
         */
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile WalSnd *walsnd = MyWalSnd;

                SpinLockAcquire(&walsnd->mutex);
                walsnd->write = writePtr;  // these values were received by the walsender from the walreceiver
                walsnd->flush = flushPtr;
                walsnd->apply = applyPtr;
                SpinLockRelease(&walsnd->mutex);
        }
....


How does the walreceiver compute these locations? The data structure is in
src/backend/replication/walreceiver.c

/*
 * LogstreamResult indicates the byte positions that we have already
 * written/fsynced.
 */
static struct
{
        XLogRecPtr      Write;                  /* last byte + 1 written out in the standby */
        XLogRecPtr      Flush;                  /* last byte + 1 flushed in the standby */
}       LogstreamResult;


// Sending the location info to the walsender

......
/*
 * Send reply message to primary, indicating our current XLOG positions, oldest
 * xmin and the current time.
 *
 * If 'force' is not set, the message is only sent if enough time has
 * passed since last status update to reach wal_receiver_status_interval.
 * If wal_receiver_status_interval is disabled altogether and 'force' is
 * false, this is a no-op.
 *
 * If 'requestReply' is true, requests the server to reply immediately upon
 * receiving this message. This is used for heartbearts, when approaching
 * wal_receiver_timeout.
 */
static void
XLogWalRcvSendReply(bool force, bool requestReply)
{
.....
        /* Construct a new message */
        writePtr = LogstreamResult.Write;
        flushPtr = LogstreamResult.Flush;
        applyPtr = GetXLogReplayRecPtr(NULL);

        resetStringInfo(&reply_message);
        pq_sendbyte(&reply_message, 'r');
        pq_sendint64(&reply_message, writePtr);
        pq_sendint64(&reply_message, flushPtr);
        pq_sendint64(&reply_message, applyPtr);
        pq_sendint64(&reply_message, GetCurrentIntegerTimestamp());
        pq_sendbyte(&reply_message, requestReply ? 1 : 0);
......


The write() and fsync() calls:

/*
 * Write XLOG data to disk.
 */
static void
XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
{
......
        int                     startoff;
        int                     byteswritten;

        while (nbytes > 0)
        {
                int                     segbytes;

                if (recvFile < 0 || !XLByteInSeg(recptr, recvSegNo))
                {
                        bool            use_existent;

                        /*
                         * fsync() and close current file before we switch to next one. We
                         * would otherwise have to reopen this file to fsync it later
                         */
                        if (recvFile >= 0)
                        {
                                char            xlogfname[MAXFNAMELEN];

                                XLogWalRcvFlush(false);  // fsync the data to disk and advance the flush location to the previous write location
......
                /* OK to write the logs */
                errno = 0;

                byteswritten = write(recvFile, buf, segbytes);  // write() only pushes the data into the OS dirty page cache
                if (byteswritten <= 0)
                {
                        /* if write didn't set errno, assume no disk space */
                        if (errno == 0)
                                errno = ENOSPC;
                        ereport(PANIC,
                                        (errcode_for_file_access(),
                                         errmsg("could not write to log segment %s "
                                                        "at offset %u, length %lu: %m",
                                                        XLogFileNameP(recvFileTLI, recvSegNo),
                                                        recvOff, (unsigned long) segbytes)));
                }

                /* Update state for write */
                recptr += byteswritten;  // advance the latest write position

                recvOff += byteswritten;
                nbytes -= byteswritten;
                buf += byteswritten;

                LogstreamResult.Write = recptr;  // update the Write location
...


/*
 * Flush the log to disk.
 *
 * If we're in the midst of dying, it's unwise to do anything that might throw
 * an error, so we skip sending a reply in that case.
 */
static void
XLogWalRcvFlush(bool dying)
{
        if (LogstreamResult.Flush < LogstreamResult.Write)
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile WalRcvData *walrcv = WalRcv;

                issue_xlog_fsync(recvFile, recvSegNo);

                LogstreamResult.Flush = LogstreamResult.Write;  // set Flush to the previous Write value
......

The WAL fsync call:
src/backend/access/transam/xlog.c

/*
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
 *
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
 * 'log' and 'seg' are for error reporting purposes.
 */
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
        switch (sync_method)
        {
                case SYNC_METHOD_FSYNC:
                        if (pg_fsync_no_writethrough(fd) != 0)
                                ereport(PANIC,
                                                (errcode_for_file_access(),
                                                 errmsg("could not fsync log file %s: %m",
                                                                XLogFileNameP(ThisTimeLineID, segno))));
                        break;
#ifdef HAVE_FSYNC_WRITETHROUGH
                case SYNC_METHOD_FSYNC_WRITETHROUGH:
                        if (pg_fsync_writethrough(fd) != 0)
                                ereport(PANIC,
                                                (errcode_for_file_access(),
                                          errmsg("could not fsync write-through log file %s: %m",
                                                         XLogFileNameP(ThisTimeLineID, segno))));
                        break;
#endif
#ifdef HAVE_FDATASYNC
                case SYNC_METHOD_FDATASYNC:
                        if (pg_fdatasync(fd) != 0)
                                ereport(PANIC,
                                                (errcode_for_file_access(),
                                                 errmsg("could not fdatasync log file %s: %m",
                                                                XLogFileNameP(ThisTimeLineID, segno))));
                        break;
#endif
                case SYNC_METHOD_OPEN:
                case SYNC_METHOD_OPEN_DSYNC:
                        /* write synced it already */
                        break;
                default:
                        elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
                        break;
        }
}

See src/backend/storage/file/fd.c for the implementations of these sync calls.

To summarize:
 sent_location     the latest WAL location sent to the standby (the latest location the standby has requested)
 write_location    the latest WAL location the standby has received and written out with write(), i.e. into the OS dirty page cache
 flush_location    the latest WAL location the standby has received and flushed to disk through the fsync interface configured by wal_sync_method
 replay_location   the latest WAL location the standby has received and replayed (applied) during recovery
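
With these four locations, a quick way to see how far each standby lags behind the primary is to diff them. This is a small sketch of mine, not part of the walkthrough above; the column names and pg_xlog_location_diff are the 9.4/9.5 ones.

select application_name, client_addr, state, sync_state,
       pg_xlog_location_diff(sent_location,  write_location)  as write_lag_bytes,   -- received, but possibly only in the OS cache
       pg_xlog_location_diff(sent_location,  flush_location)  as flush_lag_bytes,   -- not yet fsync'ed on the standby
       pg_xlog_location_diff(sent_location,  replay_location) as replay_lag_bytes   -- not yet applied on the standby
  from pg_stat_replication;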

Updating gcc

Download the new version
https://gcc.gnu.org/mirrors.html
Extract it
#tar -xvzf gcc-4.9.3.tar.gz
#cd gcc-4.9.3

Download the prerequisite packages
./contrib/download_prerequisites

Install the prerequisite packages
cd contrib
drwxr-xr-x 16 digoal users    4096 Jan 12 17:11 gmp-4.3.2
drwxr-xr-x  7 digoal users   20480 Jan 12 17:14 mpfr-2.4.2
drwxr-xr-x  5 digoal users    4096 Jan 12 17:15 mpc-0.8.1
drwxr-xr-x  9 digoal users   20480 Jan 12 17:18 isl-0.12.2
drwxr-xr-x 15 digoal users    4096 Jan 12 17:20 cloog-0.18.1

Build gcc
#export LD_LIBRARY_PATH=/u02/digoal/cloog/lib:/u02/digoal/gmp/lib:/u02/digoal/isl/lib:/u02/digoal/mpc/lib:/u02/digoal/mpfr/lib:$LD_LIBRARY_PATH

#./configure --prefix=/u02/digoal/gcc4.9.3 --with-mpc=/u02/digoal/mpc --with-mpfr=/u02/digoal/mpfr --with-gmp=/u02/digoal/gmp --with-cloog=/u02/digoal/cloog --with-isl=/u02/digoal/isl --disable-isl-version-check --disable-multilib --disable-libatomic

make -j 32
make install -j 32

Add the environment variables to /etc/profile
export LD_LIBRARY_PATH=/u02/digoal/gcc4.9.3/lib:/u02/digoal/cloog/lib:/u02/digoal/gmp/lib:/u02/digoal/isl/lib:/u02/digoal/mpc/lib:/u02/digoal/mpfr/lib:$LD_LIBRARY_PATH
export PATH=/u02/digoal/gcc4.9.3/bin:$PATH

Edit ld.so.conf
# vi /etc/ld.so.conf
/u02/digoal/gcc4.9.3/lib
/u02/digoal/cloog/lib
/u02/digoal/gmp/lib
/u02/digoal/isl/lib
/u02/digoal/mpc/lib
/u02/digoal/mpfr/lib

# ldconfig

[References]
1. https://gcc.gnu.org/install/prerequisites.html


llvm, clang

gcc
export PATH=/u02/dege.zzz/gcc4.9.3/bin:$PATH
export LD_LIBRARY_PATH=/u02/dege.zzz/gcc4.9.3/lib64:/u02/dege.zzz/gcc4.9.3/lib:$LD_LIBRARY_PATH

cmake
wget https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz
tar -zxvf cmake-3.4.1.tar.gz
cd cmake-3.4.1
./configure --prefix=/u02/dege.zzz/cmake
make
make install
export PATH=/u02/dege.zzz/cmake/bin:$PATH

python
wget https://www.python.org/ftp/python/2.7.11/Python-2.7.11.tar.xz
tar -xvf Python-2.7.11.tar.xz
cd Python-2.7.11
./configure --prefix=/u02/dege.zzz/python2.7.11 --enable-shared
make -j 32
make install -j 32
export PATH=/u02/dege.zzz/python2.7.11/bin:$PATH
export LD_LIBRARY_PATH=/u02/dege.zzz/python2.7.11/lib:$LD_LIBRARY_PATH
vi /etc/ld.so.conf
/u02/dege.zzz/python2.7.11/lib

llvm, clang
wget http://llvm.org/releases/3.7.1/llvm-3.7.1.src.tar.xz
wget http://llvm.org/releases/3.7.1/cfe-3.7.1.src.tar.xz
wget http://llvm.org/releases/3.7.1/compiler-rt-3.7.1.src.tar.xz
wget http://llvm.org/releases/3.7.1/clang-tools-extra-3.7.1.src.tar.xz
wget http://llvm.org/releases/3.7.1/libcxx-3.7.1.src.tar.xz

tar -xvf llvm-3.7.1.src.tar.xz
tar -xvf cfe-3.7.1.src.tar.xz
tar -xvf compiler-rt-3.7.1.src.tar.xz
tar -xvf clang-tools-extra-3.7.1.src.tar.xz
tar -xvf libcxx-3.7.1.src.tar.xz

mv cfe-3.7.1.src clang
mv clang/ llvm-3.7.1.src/tools/

mv clang-tools-extra-3.7.1.src extra
mv extra/ llvm-3.7.1.src/tools/clang/

mv compiler-rt-3.7.1.src compiler-rt
mv compiler-rt llvm-3.7.1.src/projects/

mkdir mybuild
cd mybuild

cmake /u02/dege.zzz/soft_bak/llvm-3.7.1.src
Build and install
cmake --build .
cmake -DCMAKE_INSTALL_PREFIX=/u02/dege.zzz/llvm -P cmake_install.cmake
Alternatively, with the autoconf build:
/u02/dege.zzz/soft_bak/llvm-3.7.1.src/configure --prefix=/u02/dege.zzz/llvm --enable-optimized --enable-targets=host-only CC=gcc CXX=g++
make -j 32
make install -j 32

export PATH=/u02/dege.zzz/llvm/bin:$PATH
export LD_LIBRARY_PATH=/u02/dege.zzz/llvm/lib:$LD_LIBRARY_PATH
# vi /etc/ld.so.conf
/u02/dege.zzz/llvm/lib
# ldconfig




Graph database Cayley + PostgreSQL

Cayley is a graph database engine written in Go. It provides a RESTful API, a built-in query editor and visualizer, MQL and JavaScript query interfaces, and several backend stores: flat files, PostgreSQL, MongoDB, LevelDB and Bolt. Its modular design makes adding new backend stores easy.
This article demonstrates Cayley with PostgreSQL as the backend.

Install Go:
yum install -y go

Run the following commands to clone Cayley and its dependencies:
mkdir -p ~/cayley && cd ~/cayley
export GOPATH=`pwd`
export PATH=$PATH:~/cayley/bin
mkdir -p bin pkg src/github.com/google
cd src/github.com/google
git clone https://github.com/google/cayley
cd cayley
go get github.com/tools/godep
godep restore
go build ./cmd/cayley

Sample data:
$ ll data
-rw-rw-r--. 1 postgres postgres 26M Jan 17 21:45 30kmoviedata.nq.gz
-rw-rw-r--. 1 postgres postgres 463 Jan 17 21:45 testdata.nq
$ gunzip 30kmoviedata.nq.gz

Cayley usage help:
$ ./cayley --help
No command --help

Usage:
  cayley COMMAND [flags]

Commands:
  init      Create an empty database.
  load      Bulk-load a quad file into the database.
  http      Serve an HTTP endpoint on the given host and port.
  dump      Bulk-dump the database into a quad file.
  repl      Drop into a REPL of the given query language.
  version   Version information.

Flags:
  -alsologtostderr=false: log to standard error as well as files
  -assets="": Explicit path to the HTTP assets.
  -config="": Path to an explicit configuration file.
  -db="memstore": Database Backend.
  -dbpath="/tmp/testdb": Path to the database.
  -dump="dbdump.nq": Quad file to dump the database to (".gz" supported, "-" for stdout).
  -dump_type="quad": Quad file format ("json", "quad", "gml", "graphml").
  -format="cquad": Quad format to use for loading ("cquad" or "nquad").
  -host="127.0.0.1": Host to listen on (defaults to all).
  -ignoredup=false: Don't stop loading on duplicated key on add
  -ignoremissing=false: Don't stop loading on missing key on delete
  -init=false: Initialize the database before using it. Equivalent to running `cayley init` followed by the given command.
  -load_size=10000: Size of quadsets to load
  -log_backtrace_at=:0: when logging hits line file:N, emit a stack trace
  -log_dir="": If non-empty, write log files in this directory
  -logstashtype="": enable logstash logging and define the type
  -logstashurl="172.17.42.1:5042": logstash url and port
  -logtostderr=false: log to standard error instead of files
  -port="64210": Port to listen on.
  -prof="": Output profiling file.
  -quads="": Quad file to load before going to REPL.
  -query_lang="gremlin": Use this parser as the query language.
  -read_only=false: Disable writing via HTTP.
  -replication="single": Replication method.
  -stderrthreshold=0: logs at or above this threshold go to stderr
  -timeout=30s: Elapsed time until an individual query times out.
  -v=0: log level for V logs
  -vmodule=: comma-separated list of pattern=N settings for file-filtered logging

Assume an existing PostgreSQL database:
IP : 192.168.150.132
PORT : 1921
DBNAME : postgres
USER : digoal
PASSWORD : digoal_pwd

Initialize:
./cayley init -db=sql -dbpath="postgres://digoal:digoal_pwd@192.168.150.132:1921/postgres?sslmode=disable"

Load the data:
./cayley load -quads="data/" -db=sql -dbpath="postgres://digoal:digoal_pwd@192.168.150.132:1921/postgres?sslmode=disable"
5 billion test records take roughly 2TB.

Start the REPL or the HTTP service:
./cayley repl -db=sql -dbpath="postgres://digoal:digoal_pwd@192.168.150.132:1921/postgres?sslmode=disable" -host="0.0.0.0" -port="64210"
./cayley http -db=sql -dbpath="postgres://digoal:digoal_pwd@192.168.150.132:1921/postgres?sslmode=disable" -host="0.0.0.0" -port="64210"

Screenshot of the HTTP interface:

 Query Shape:
With PostgreSQL as the backend, Cayley automatically translates MQL or JavaScript queries into SQL, runs them against the database, and returns the results.

When PostgreSQL is the backend, the main tuning options are:
1. Use a GPU to accelerate hash joins and scans.
2. Use partitioned tables to avoid scanning irrelevant blocks (see the sketch below).
3. Other general PostgreSQL tuning techniques.
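
A minimal sketch of point 2, using the inheritance-style partitioning available in PostgreSQL 9.5. The table and column names are only illustrative; Cayley's actual SQL schema may differ.

-- parent table (an illustrative quad layout, not Cayley's real schema)
create table quads (
  subject   text not null,
  predicate text not null,
  object    text not null,
  label     text
);

-- child tables carry CHECK constraints describing their slice of the data
create table quads_a_m (check (subject >= 'a' and subject < 'n')) inherits (quads);
create table quads_n_z (check (subject >= 'n')) inherits (quads);

-- with constraint exclusion, a lookup such as
--   select * from quads where subject = 'alice';
-- only scans the partition whose CHECK constraint can hold 'alice'
set constraint_exclusion = partition;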

If the data volume outgrows the compute and I/O capacity of a single instance, Greenplum can be used for distributed queries.

Query interface:
Javascript/Gremlin API documentation
Graph object
  look up by node ID, return paths
Path object
  path intersection, node matching, etc.
Query path object
  value transformation, etc.


Compiling PostgreSQL with clang

In actual testing, PostgreSQL compiled with clang 3.7.1 performs slightly better than when compiled with gcc 4.9.3.
For the test data, see the reference at the end of this article.

gcc installation

llvm and clang installation

CC=/u02/dege.zzz/llvm/bin/clang CFLAGS="-O2 -fstrict-enums" ./configure --prefix=/u02/digoal/soft_bak/pgsql9.5  --with-pgport=1921 --with-perl --with-python --with-tcl --with-openssl --with-pam --with-ldap --with-libxml --with-libxslt --enable-thread-safety
make world -j 32
make install-world -j 32

[References]
http://www.kitware.com/blog/home/post/1016

perf Performance analysis tools for Linux

perf is a performance analysis framework built on a kernel subsystem; it covers both hardware-level (CPU/PMU) and software-level profiling.
man perf

PERF(1)                           perf Manual                          PERF(1)
NAME
       perf - Performance analysis tools for Linux

SYNOPSIS
       perf [--version] [--help] COMMAND [ARGS]

DESCRIPTION
       Performance counters for Linux are a new kernel-based subsystem that provide a framework for all things performance analysis. It covers hardware level (CPU/PMU, Performance Monitoring Unit) features and
       software features (software counters, tracepoints) as well.
SEE ALSO
       perf-stat(1), perf-top(1), perf-record(1), perf-report(1), perf-list(1)


List the traceable events by category
perf list [...]

        1.  hw or hardware to list hardware events such as cache-misses, etc.

        2.  sw or software to list software events such as context switches, etc.

        3.  cache or hwcache to list hardware cache events such as L1-dcache-loads, etc.

        4.  tracepoint to list all tracepoint events, alternatively use subsys_glob:event_glob to filter by tracepoint subsystems such as sched, block, etc.


For example:

#perf list hw
  cpu-cycles OR cycles                               [Hardware event]
  stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
  stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
  instructions                                       [Hardware event]
  cache-references                                   [Hardware event]
  cache-misses                                       [Hardware event]
  branch-instructions OR branches                    [Hardware event]
  branch-misses                                      [Hardware event]
  bus-cycles                                         [Hardware event]

#perf list sw
  cpu-clock                                          [Software event]
  task-clock                                         [Software event]
  page-faults OR faults                              [Software event]
  minor-faults                                       [Software event]
  major-faults                                       [Software event]
  context-switches OR cs                             [Software event]
  cpu-migrations OR migrations                       [Software event]
  alignment-faults                                   [Software event]
  emulation-faults                                   [Software event]

#perf list cache
  L1-dcache-loads                                    [Hardware cache event]
  L1-dcache-load-misses                              [Hardware cache event]
  L1-dcache-stores                                   [Hardware cache event]
  L1-dcache-store-misses                             [Hardware cache event]
  L1-dcache-prefetches                               [Hardware cache event]
  L1-dcache-prefetch-misses                          [Hardware cache event]
  L1-icache-loads                                    [Hardware cache event]
  L1-icache-load-misses                              [Hardware cache event]
  L1-icache-prefetches                               [Hardware cache event]
  L1-icache-prefetch-misses                          [Hardware cache event]
  LLC-loads                                          [Hardware cache event]
  LLC-load-misses                                    [Hardware cache event]
  LLC-stores                                         [Hardware cache event]
  LLC-store-misses                                   [Hardware cache event]
  LLC-prefetches                                     [Hardware cache event]
  LLC-prefetch-misses                                [Hardware cache event]
  dTLB-loads                                         [Hardware cache event]
  dTLB-load-misses                                   [Hardware cache event]
  dTLB-stores                                        [Hardware cache event]
  dTLB-store-misses                                  [Hardware cache event]
  dTLB-prefetches                                    [Hardware cache event]
  dTLB-prefetch-misses                               [Hardware cache event]
  iTLB-loads                                         [Hardware cache event]
  iTLB-load-misses                                   [Hardware cache event]
  branch-loads                                       [Hardware cache event]
  branch-load-misses                                 [Hardware cache event]

#perf list tracepoint
  xfs:xfs_attr_list_sf                               [Tracepoint event]
  xfs:xfs_attr_list_sf_all                           [Tracepoint event]
  xfs:xfs_attr_list_leaf                             [Tracepoint event]
  xfs:xfs_attr_list_leaf_end                         [Tracepoint event]
  xfs:xfs_attr_list_full                             [Tracepoint event]
  xfs:xfs_attr_list_add                              [Tracepoint event]
  xfs:xfs_attr_list_wrong_blk                        [Tracepoint event]
  xfs:xfs_attr_list_notfound                         [Tracepoint event]
  xfs:xfs_attr_leaf_list                             [Tracepoint event]
  xfs:xfs_attr_node_list                             [Tracepoint event]
......


The commands perf provides

#perf

 usage: perf [--version] [--help] COMMAND [ARGS]

 The most commonly used perf commands are:
   annotate        Read perf.data (created by perf record) and display annotated code
   archive         Create archive with object files with build-ids found in perf.data file
   bench           General framework for benchmark suites
   buildid-cache   Manage build-id cache.
   buildid-list    List the buildids in a perf.data file
   diff            Read two perf.data files and display the differential profile
   evlist          List the event names in a perf.data file
   inject          Filter to augment the events stream with additional information
   kmem            Tool to trace/measure kernel memory(slab) properties
   kvm             Tool to trace/measure kvm guest os
   list            List all symbolic event types
   lock            Analyze lock events
   record          Run a command and record its profile into perf.data
   report          Read perf.data (created by perf record) and display the profile
   sched           Tool to trace/measure scheduler properties (latencies)
   script          Read perf.data (created by perf record) and display trace output
   stat            Run a command and gather performance counter statistics
   test            Runs sanity tests.
   timechart       Tool to visualize total system behavior during a workload
   top             System profiling tool.


Help for each command:
man perf-CMD, e.g. man perf-top

The most heavily used command is perf top. It yields statistics similar to oprofile's, but while oprofile can drill down into the code, perf top reports per-symbol instruction samples.
For example, here is the perf top output while a GiST index was being built.

PerfTop: 1320 irqs/sec kernel:23.3% exact: 0.0% [1000Hz cycles], (all, 32 CPUs)
-------------------------------------------------------

samples pcnt function DSO
_______ _____ _________________________________ ________________________________________________

3528.00 31.2% gistchoose /u02/digoal/soft_bak/pgsql9.5/bin/postgres
2612.00 23.1% gist_box_penalty /u02/digoal/soft_bak/pgsql9.5/bin/postgres
543.00 4.8% FunctionCall3Coll /u02/digoal/soft_bak/pgsql9.5/bin/postgres
441.00 3.9% aliflash_reconfig_task [aliflash]
438.00 3.9% FunctionCall1Coll /u02/digoal/soft_bak/pgsql9.5/bin/postgres
378.00 3.3% hash_search_with_hash_value /u02/digoal/soft_bak/pgsql9.5/bin/postgres
234.00 2.1% isnanf /lib64/libc-2.12.so
169.00 1.5% LWLockAcquire /u02/digoal/soft_bak/pgsql9.5/bin/postgres
152.00 1.3% gistDeCompressAtt /u02/digoal/soft_bak/pgsql9.5/bin/postgres
142.00 1.3% LWLockRelease /u02/digoal/soft_bak/pgsql9.5/bin/postgres
135.00 1.2% copy_user_enhanced_fast_string [kernel.kallsyms]
115.00 1.0% gistProcessItup /u02/digoal/soft_bak/pgsql9.5/bin/postgres
106.00 0.9% _raw_spin_lock [kernel.kallsyms]
100.00 0.9% gist_box_decompress /u02/digoal/soft_bak/pgsql9.5/bin/postgres
78.00 0.7% AllocSetAlloc /u02/digoal/soft_bak/pgsql9.5/bin/postgres
69.00 0.6% _raw_spin_lock_irq [kernel.kallsyms]
66.00 0.6% PinBuffer /u02/digoal/soft_bak/pgsql9.5/bin/postgres
63.00 0.6% hash_any /u02/digoal/soft_bak/pgsql9.5/bin/postgres
58.00 0.5% gistgetadjusted /u02/digoal/soft_bak/pgsql9.5/bin/postgres
56.00 0.5% __schedule [kernel.kallsyms]
46.00 0.4% gist_box_union /u02/digoal/soft_bak/pgsql9.5/bin/postgres
45.00 0.4% __list_del_entry [kernel.kallsyms]
44.00 0.4% heap_getnext /u02/digoal/soft_bak/pgsql9.5/bin/postgres
42.00 0.4% find_get_pages [kernel.kallsyms]
42.00 0.4% ReadBuffer_common /u02/digoal/soft_bak/pgsql9.5/bin/postgres
41.00 0.4% UnpinBuffer /u02/digoal/soft_bak/pgsql9.5/bin/postgres
40.00 0.4% update_lunset_sq_head [aliflash]
37.00 0.3% huge_pte_offset [kernel.kallsyms]
37.00 0.3% memcpy /lib64/libc-2.12.so
33.00 0.3% dm_blk_close /lib/modules/3.18.24/kernel/drivers/md/dm-mod.ko
32.00 0.3% __memcmp_sse4_1 /lib64/libc-2.12.so
30.00 0.3% gistPushItupToNodeBuffer /u02/digoal/soft_bak/pgsql9.5/bin/postgres
30.00 0.3% _raw_spin_lock_irqsave [kernel.kallsyms]
28.00 0.2% pg_qsort /u02/digoal/soft_bak/pgsql9.5/bin/postgres
28.00 0.2% _raw_spin_lock_bh [kernel.kallsyms]
26.00 0.2% slot_deform_tuple /u02/digoal/soft_bak/pgsql9.5/bin/postgres
26.00 0.2% xfs_alloc_ioend /lib/modules/3.18.24/kernel/fs/xfs/xfs.ko
25.00 0.2% hash_uint32 /u02/digoal/soft_bak/pgsql9.5/bin/postgres
25.00 0.2% xfs_fs_geometry /lib/modules/3.18.24/kernel/fs/xfs/xfs.ko
22.00 0.2% palloc /u02/digoal/soft_bak/pgsql9.5/bin/postgres
22.00 0.2% __block_commit_write [kernel.kallsyms]
22.00 0.2% __switch_to [kernel.kallsyms]
22.00 0.2% __random_r /lib64/libc-2.12.so
21.00 0.2% LockBuffer /u02/digoal/soft_bak/pgsql9.5/bin/postgres
21.00 0.2% apic_timer_interrupt [kernel.kallsyms]
21.00 0.2% lapic_next_deadline [kernel.kallsyms]
20.00 0.2% gist_box_same /u02/digoal/soft_bak/pgsql9.5/bin/postgres
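
perf top can also be pointed at a single backend with its -p option; the pid is easy to get from SQL. This is a small aside of mine, not part of the output above.

-- run this in the session you want to profile, then attach with: perf top -p <pid>
select pg_backend_pid();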


Other diagnostic tools:
oprofile

PostgreSQL nearest-neighbour (KNN) query benchmark on 10 billion geo-location records

This article demonstrates PostgreSQL's performance on nearest-neighbour (KNN) queries over location data.
The data type under test is point, indexed with GiST. (PostGIS also supports KNN queries, with performance close to that measured here.)
The test data set exceeds 10 billion rows.
For the test environment and tuning, refer to:

Create the test table

postgres=# create table tbl_point(id serial8, poi point);
CREATE TABLE
postgres=# \d tbl_point
                      Table "benchmarksql.tbl_point"
 Column |  Type  |                       Modifiers                        
--------+--------+--------------------------------------------------------
 id     | bigint | not null default nextval('tbl_point_id_seq'::regclass)
 poi    | point  | 

postgres=# alter sequence tbl_point_id_seq cache 10000;
ALTER SEQUENCE


Generate the test data with the following script.
Both x and y of each point range from -50000 to 50000, so there are 10 billion distinct possible points,
which matches the target data volume.

vi test.sql
insert into tbl_point(poi) select point(trunc(100000*(0.5-random())), trunc(100000*(0.5-random()))) from generate_series(1,10000);


With pgbench, roughly 2.33 million points are inserted per second (233 transactions of 10,000 rows each).

pgbench -M prepared -n -r -f ./test.sql -P 1 -c 96 -j 96 -T 1100
tps = 233.018365 (including connections establishing)
tps = 233.150940 (excluding connections establishing)


Row count so far

postgres=# select count(*) from tbl_point;
   count    
------------
 2532820000
(1 row)


Current table size:

postgres=# \dt+
 benchmarksql | tbl_point  | table | postgres | 123 GB   | 


Create a GiST index on the point column

postgres=# create index idx_tbl_point on tbl_point using gist(poi) with (buffering=on);

postgres=# \d+ tbl_point
                                         Table "benchmarksql.tbl_point"
 Column |  Type  |                       Modifiers                        | Storage | Stats target | Description 
--------+--------+--------------------------------------------------------+---------+--------------+-------------
 id     | bigint | not null default nextval('tbl_point_id_seq'::regclass) | plain   |              | 
 poi    | point  |                                                        | plain   |              | 
Indexes:
    "idx_tbl_point" gist (poi) WITH (buffering='on')


Index size:

\di+
 benchmarksql | idx_tbl_point      | index | postgres | tbl_point  | 170 GB  | 


With the new index in place, insert performance drops: about 550,000 points per second now.

pgbench -M prepared -n -r -f ./test.sql -P 1 -c 96 -j 96 -T 100
transaction type: Custom query
scaling factor: 1
query mode: prepared
number of clients: 96
number of threads: 96
duration: 100 s
number of transactions actually processed: 5587
latency average: 1726.947 ms
latency stddev: 118.223 ms
tps = 55.390665 (including connections establishing)
tps = 55.419003 (excluding connections establishing)
statement latencies in milliseconds:
        1726.946947     insert into tbl_point(poi) select point(trunc(100000*(0.5-random())), trunc(100000*(0.5-random()))) from generate_series(1,10000);


At this rate, the load runs for another 13,600 seconds and is stopped once the table exceeds 10 billion rows.

pgbench -M prepared -n -r -f ./test.sql -P 1 -c 64 -j 64 -T 13600
Table: 500GB

Index: 720GB


KNN query example:

postgres=# select *,poi <-> point(1000,1000) dist from tbl_point where poi <-> point(1000,1000) < 100 order by poi <-> point(1000,1000) limit 10;
     id     |     poi     |       dist       
------------+-------------+------------------
  399588621 | (1000,999)  |                1
 1030719903 | (1001,999)  |  1.4142135623731
 2698052191 | (1001,1001) |  1.4142135623731
 3291219762 | (999,999)   |  1.4142135623731
 2757190006 | (1002,1000) |                2
 2862610530 | (998,1001)  | 2.23606797749979
 3450459141 | (998,1001)  | 2.23606797749979
 3124756442 | (1002,1001) | 2.23606797749979
 3105439886 | (1001,998)  | 2.23606797749979
  473144305 | (998,1002)  | 2.82842712474619
(10 rows)

In the execution plan, both the ordering and the filtering use the GiST index.
For example, the query below touches 16 data blocks: 8 hit in shared buffers and 8 read (possibly from the OS cache, possibly straight from the block device).

postgres=# explain (analyze,verbose,buffers,timing,costs) select *,poi <-> point(10090,10090) dist from tbl_point where poi <-> point(10090,10090) < 100 order by poi <-> point(10090,10090) limit 10;
                                                                           QUERY PLAN                                                                           
----------------------------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=0.56..13.15 rows=10 width=24) (actual time=0.469..1.309 rows=10 loops=1)
   Output: id, poi, ((poi <-> '(10090,10090)'::point))
   Buffers: shared hit=8 read=8 dirtied=1
   ->  Index Scan using idx_tbl_point on benchmarksql.tbl_point  (cost=0.56..1510464450.86 rows=1199422376 width=24) (actual time=0.468..1.306 rows=10 loops=1)
         Output: id, poi, (poi <-> '(10090,10090)'::point)
         Order By: (tbl_point.poi <-> '(10090,10090)'::point)
         Filter: ((tbl_point.poi <-> '(10090,10090)'::point) < '100'::double precision)
         Buffers: shared hit=8 read=8 dirtied=1
 Planning time: 0.084 ms
 Execution time: 1.347 ms
(10 rows)


Below is the stress test of PostgreSQL's KNN query performance across the 10 billion location records.
The test script generates a random point, finds the points within a distance of 100 of it, orders them by distance, and returns one row.

vi test.sql
\setrandom x -50000 50000
\setrandom y -50000 50000
select * from tbl_point where poi <-> point(:x,:y) <100 order by poi <-> point(:x,:y) limit 1;


Test result:
the average response time per request is 0.858 milliseconds.

pgbench -M prepared -n -r -f ./test.sql -P 1 -c 64 -j 64 -T 100
transaction type: Custom query
scaling factor: 1
query mode: prepared
number of clients: 64
number of threads: 64
duration: 100 s
number of transactions actually processed: 7418337
latency average: 0.858 ms
latency stddev: 0.564 ms
tps = 74151.604194 (including connections establishing)
tps = 74184.255934 (excluding connections establishing)
statement latencies in milliseconds:
        0.007518        \setrandom x -50000 50000
        0.002193        \setrandom y -50000 50000
        0.847847        select * from tbl_point where poi <-> point(:x,:y) <100 order by poi <-> point(:x,:y) limit 1;


Because the data set is large (table plus index exceed 1TB, far beyond the amount of RAM), achieving a 0.858 ms response time relies on the AliFlash PCI-E SSD cards: a single I/O request completes in 0.01 ms on average, with 0.19 ms spent waiting in the queue.

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
          69.54    0.00   24.11    5.87    0.00    0.47
Device:         rrqm/s   wrqm/s     r/s     w/s   rsec/s   wsec/s avgrq-sz avgqu-sz   await  svctm  %util
dfa               0.00     0.00 26100.00 2096.00 417600.00 33536.00    16.00     5.05    0.18   0.03  98.00
dfb               0.00     0.00 26150.00 2038.00 418400.00 32600.00    16.00     5.01    0.18   0.03  98.40
dfc               0.00     0.00 25931.00 2026.00 414896.00 32384.00    16.00     6.15    0.22   0.04  99.70
dm-0              0.00     0.00 78178.00 6160.00 1250848.00 98520.00    16.00    16.73    0.19   0.01 101.00


[Other optimizations]
When no data lies within the requested distance, the number of GiST index pages scanned blows up. The optimization is to ORDER BY ... LIMIT first, and only then filter on the distance condition.
Example:
this query may run for a very long time before returning, and the result may well be zero rows.

explain (analyze,verbose,buffers,timing,costs) select *,poi <-> point(10090,10090000) dist from tbl_point where poi <-> point(10090,10090000) < 100 order by poi <-> point(10090,10090000) limit 10;

The optimized form: ORDER BY ... LIMIT first, then filter on the distance.

postgres=# explain (analyze,verbose,buffers,timing,costs) select * from (select *,poi <-> point(10090,10090000) dist from tbl_point order by poi <-> point(10090,10090000) limit 1000 ) t where poi <-> point(10090,10090000) < 100 limit 10; 
                                                                                  QUERY PLAN                                                                                  
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=0.56..13.51 rows=10 width=32) (actual time=3.769..3.769 rows=0 loops=1)
   Output: t.id, t.poi, t.dist
   Buffers: shared hit=1174
   ->  Subquery Scan on t  (cost=0.56..432.00 rows=333 width=32) (actual time=3.769..3.769 rows=0 loops=1)
         Output: t.id, t.poi, t.dist
         Filter: ((t.poi <-> '(10090,10090000)'::point) < '100'::double precision)
         Rows Removed by Filter: 1000
         Buffers: shared hit=1174
         ->  Limit  (cost=0.56..417.00 rows=1000 width=24) (actual time=0.106..3.596 rows=1000 loops=1)
               Output: tbl_point.id, tbl_point.poi, ((tbl_point.poi <-> '(10090,10090000)'::point))
               Buffers: shared hit=1174
               ->  Index Scan using idx_tbl_point on benchmarksql.tbl_point  (cost=0.56..1498470227.10 rows=3598267127 width=24) (actual time=0.105..3.505 rows=1000 loops=1)
                     Output: tbl_point.id, tbl_point.poi, (tbl_point.poi <-> '(10090,10090000)'::point)
                     Order By: (tbl_point.poi <-> '(10090,10090000)'::point)
                     Buffers: shared hit=1174
 Planning time: 0.069 ms
 Execution time: 3.793 ms
(17 rows)

PostGIS plays the same trick:

digoal=# select * from (select *,ST_Distance(jwd, ST_Transform(ST_GeomFromText('POINT(120.19 30.26)', 4326), 2163)) AS dist from cust_jw order by jwd <-> ST_Transform(ST_GeomFromText('POINT(120.19 30.26)', 4326), 2163) limit 1000) t where dist<15000;
  dz  |                        jwd                         |       dist       
------+----------------------------------------------------+------------------
 杭州 | 0101000020730800004C94087D5D4F54C173AA7759E8FB5D41 |                0
 余杭 | 0101000020730800000E6E5A20494854C121FC688DA9EF5D41 | 14483.9823187612
(2 rows)
Time: 0.634 ms


An even more frugal variant, pushed to the extreme to save resources, uses a cursor to solve the same problem; it scans at most one extra index page.

digoal=# do language plpgsql $$
declare
  v_rec record;
  v_limit int := 1000;
begin
  set enable_seqscan=off;  -- force the index scan; we return as soon as enough rows have been scanned
  for v_rec in select *,ST_Distance(jwd, ST_Transform(ST_GeomFromText('POINT(120.19 30.26)', 4326), 2163)) AS dist from cust_jw order by jwd <-> ST_Transform(ST_GeomFromText('POINT(120.19 30.26)', 4326), 2163) loop
    if v_limit <=0 then 
      raise notice 'fetched enough rows';
      return;
    end if;
    if v_rec.dist > 20000 then 
      raise notice 'all qualifying points have been output';
      return;
    else
      raise notice 'do something, v_rec:%', v_rec;
    end if;
    v_limit := v_limit -1;
  end loop;
end;
$$;
NOTICE:  do something, v_rec:(杭州,0101000020730800004C94087D5D4F54C173AA7759E8FB5D41,0)
NOTICE:  do something, v_rec:(余杭,0101000020730800000E6E5A20494854C121FC688DA9EF5D41,14483.9823187612)
NOTICE:  all qualifying points have been output
DO



A summary of extreme TPC-C tuning for PostgreSQL


[Markdown version]

https://github.com/digoal/pgsql_admin_script/blob/master/pgsql_perf_tuning.md


Extreme TPC-C tuning for PostgreSQL

digoal

2016-01-19

Introduction

Using the industry-standard TPC-C (tpmC) workload, this article walks through tuning PostgreSQL from the OS level up to the database level,
and explains how tpmC was raised from 256195.32 to 606466.31.

Test environment

16 cores with HT enabled (32 threads),
256GB of 1600MHz RAM,
10GbE NIC,
3 x 6.4TB AliFlash PCI-E SSDs,
striped logical volume,
XFS,
data blocks aligned.

XFS filesystem tuning

There are three main parts:
1. Logical volume tuning
2. XFS mkfs tuning
3. XFS mount tuning
All of them are covered in the man pages; understand the principles and use cases before tuning.
man lvcreate
man xfs
man mkfs.xfs
man mount

Logical volume tuning

1.1 Before creating the PV, align the block device. (Alignment avoids write amplification: an SSD has a minimum write unit, and misalignment can make it write multiple blocks.) Best leave the first 1MB unallocated and start at sector 2048.
(The --dataalignment option of pvcreate achieves the same goal.)
fdisk -c -u /dev/dfa
start 2048
end + (2048*n) - 1
Or create the partition with parted.

The LVM layout:
https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/7/html/Logical_Volume_Manager_Administration/LVM_components.html#pv_illustration
When creating the PV, the DATA area also needs to be aligned.
Start allocating the DATA extents at the 4MB mark:
# pvcreate --dataalignment 4M /dev/sdc

1st PE is where the data starts.

[root@digoal ~]# pvs -o+pe_start  
  PV         VG     Fmt  Attr PSize  PFree  1st PE   
  /dev/sda2  centos lvm2 a--  19.51g 40.00m   1.00m  
  /dev/sdc          lvm2 ---  20.00g 20.00g   4.00m  

To list all available fields:
pvs -o+

1.2 Creating the LV mainly involves two parameters.
Number of stripes: equal to the number of PVs; if a PV is itself a RAID device, base the stripe count on the number of underlying disks.
For example, RAID5 over 5 disks: subtract 1 parity disk and use 4 stripes. RAID10 over 10 disks: use 5. RAID0 over 10 disks: use 10.

       -i, --stripes Stripes  
  Gives the number of stripes.  This is equal to the number of physical volumes to scatter the logical volume.  

Stripe size: match the database block size, e.g. 8KB by default for PostgreSQL.

       -I, --stripesize StripeSize  
  Gives the number of kilobytes for the granularity of the stripes.  
  StripeSize must be 2^n (n = 2 to 9) for metadata in LVM1 format.  For metadata in LVM2 format, the stripe size may be a larger power of 2 but must not exceed the physical extent size.  

When creating snapshots, specify the chunksize parameter,
ideally equal to the database block size, e.g. PostgreSQL's default 8KB.

       -c, --chunksize ChunkSize  
  Power of 2 chunk size for the snapshot logical volume between 4k and 512k.  

For example:
reserve 2GB for the XFS log device

#lvcreate -i 3 -I 8 -n lv02 -L 2G vgdata01  
  Logical volume "lv02" created  
#lvcreate -i 3 -I 8 -n lv01 -l 100%FREE vgdata01  
  Logical volume "lv01" created  
#lvs  
  LV   VG       Attr   LSize   Origin Snap%  Move Log Copy%  Convert  
  lv01 vgdata01 -wi-a-  17.29t    
  lv02 vgdata01 -wi-a-  2g   
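
As an aside (not in the original notes), the 8KB block size assumed above can be confirmed from any running PostgreSQL instance:

-- reports the compiled-in block size in bytes, 8192 by default
show block_size;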

XFS mkfs tuning

First, understand the XFS layout.

XFS has three sections: data, log, and realtime files.
By default the log lives inside the data section and there is no realtime section. Every section is made of blocks, the smallest unit; the block size is set with -b when the filesystem is created.

2.1 data
Holds metadata (inodes, directories, indirect blocks), user file data, and non-realtime files.
The data section is split into multiple allocation groups; mkfs.xfs lets you choose the number of groups and the size of each group.
More groups allow more file and block allocations to proceed in parallel: think of operations within a group as serialized and operations across groups as parallel.
But more groups also cost more CPU, so there is a trade-off. For highly concurrent write workloads, use more groups (for example, a host running many small, busy databases).

2.2 log
Stores the metadata journal. Before metadata is modified, the change must be logged; only then can the metadata in the data section be updated.
It is also used for recovery after a crash.

2.3 realtime
Divided into many small extents. To write a file into the realtime section, you must flip a bit on the file descriptor via xfsctl, and this must be done before any data is written. File sizes in the realtime section are multiples of the realtime extent size.

mkfs.xfs tuning

For the data section:
allocation group count multiplied by AG size equals the block device size.
The AG count should match the degree of parallelism you need.
The AG size must be between 16M and 1TB; around 1GB is recommended for PostgreSQL.
-b size=8192 matches the database block size (but not every XFS build supports block sizes larger than 4K; if mount fails and reports that only blocks of 4K or less are supported, reformat with a smaller block size)
-d agcount=9000,sunit=16,swidth=48
assumes 9000 concurrent write operations, hence 9000 allocation groups;
sunit (in 512-byte units) is aligned with the LVM/RAID stripe size,
and swidth with the LVM/RAID stripe width, here 3*8 to match -i 3 -I 8 above.

log section:
Put it on an SSD, the faster the better. Avoid limiting the log device's IOPS with cgroups.

realtime section:
Do not create one unless you need it.

agsize must never be a multiple of the stripe width. (With 3 stripes of 8K each, the width is 24K.)
If the agsize computed from the requested agcount is a multiple of swidth, mkfs.xfs prints a warning.
In the example below,
agsize=156234 blks is a multiple of swidth=6 blks (26039 times).
The suggested fix is to subtract one stripe unit (8K), i.e. 156234 blks - sunit 2 blks = 156232 blks.
In bytes, 156232 blks * 4096 = 639926272 bytes, or 156232 * 4 = 624928K.

#mkfs.xfs -f -b size=4096 -l logdev=/dev/mapper/vgdata01-lv01,size=2136997888,sunit=16 -d agcount=30000,sunit=16,swidth=48 /dev/mapper/vgdata01-lv02
Warning: AG size is a multiple of stripe width.  This can cause performance
problems by aligning all AGs on the same disk.  To avoid this, run mkfs with
an AG size that is one stripe unit smaller, for example 156232.
meta-data=/dev/mapper/vgdata01-lv02 isize=256    agcount=30000, agsize=156234 blks
         =                       sectsz=4096  attr=2, projid32bit=1
         =                       crc=0        finobt=0
data     =                       bsize=4096   blocks=4686971904, imaxpct=5
         =                       sunit=2      swidth=6 blks
naming   =version 2              bsize=4096   ascii-ci=0 ftype=0
log      =/dev/mapper/vgdata01-lv01 bsize=4096   blocks=521728, version=2
         =                       sectsz=512   sunit=2 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0

So change the mkfs.xfs invocation above to either of the following:

#mkfs.xfs -f -b size=4096 -l logdev=/dev/mapper/vgdata01-lv01,size=2136997888,sunit=16 -d agsize=639926272,sunit=16,swidth=48 /dev/mapper/vgdata01-lv02

#mkfs.xfs -f -b size=4096 -l logdev=/dev/mapper/vgdata01-lv01,size=2136997888,sunit=16 -d agsize=624928k,sunit=16,swidth=48 /dev/mapper/vgdata01-lv02

The output:

meta-data=/dev/mapper/vgdata01-lv02 isize=256    agcount=30001, agsize=156232 blks  (about 600MB)
         =                       sectsz=4096  attr=2, projid32bit=1
         =                       crc=0        finobt=0
data     =                       bsize=4096   blocks=4686971904, imaxpct=5
         =                       sunit=2      swidth=6 blks
naming   =version 2              bsize=4096   ascii-ci=0 ftype=0
log      =/dev/mapper/vgdata01-lv01 bsize=4096   blocks=521728, version=2
         =                       sectsz=512   sunit=2 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0

XFS mount tuning

nobarrier
largeio          for workloads with large sequential reads, such as data warehousing and streaming media
nolargeio        for OLTP
logbsize=262144  sets the log buffer size
logdev=          the block device for the log section; use the fastest SSD available
noatime,nodiratime
swalloc          stripe-aligned allocation
allocsize=16M    buffered I/O size for delayed allocation writeout
inode64 Indicates that XFS is allowed to create inodes at any location in the filesystem

mount.xfs example

#mount -t xfs -o allocsize=16M,inode64,nobarrier,nolargeio,logbsize=262144,noatime,nodiratime,swalloc,logdev=/dev/mapper/vgdata01-lv02 /dev/mapper/vgdata01-lv01 /data01  

xfsctl tuning

Troubleshooting

#mount -o noatime,swalloc /dev/mapper/vgdata01-lv01 /data01  
mount: Function not implemented  

The cause is an unsupported block size:

[ 5736.642924] XFS (dm-0): File system with blocksize 8192 bytes. Only pagesize (4096) or less will currently work.  
[ 5736.695146] XFS (dm-0): SB validate failed with error -38.  

The fix: reformat with a 4K block size

# mkfs.xfs -f -b size=4096 -l logdev=/dev/mapper/vgdata01-lv02,size=2136997888,sunit=16 -d agcount=9000,sunit=16,swidth=48 /dev/mapper/vgdata01-lv01   

meta-data=/dev/mapper/vgdata01-lv01 isize=256    agcount=9000, agsize=515626 blks  
         =           sectsz=512   attr=2  
data     =           bsize=4096   blocks=4640621568, imaxpct=5  
         =           sunit=2      swidth=6 blks  
naming   =version 2  bsize=4096   ascii-ci=0  
log      =/dev/mapper/vgdata01-lv02 bsize=4096   blocks=521728, version=2  
         =           sectsz=512   sunit=2 blks, lazy-count=1  
realtime =none       extsz=4096   blocks=0, rtextents=0  

Specify logdev when mounting

#mount -t xfs -o allocsize=16M,inode64,nobarrier,nolargeio,logbsize=262144,noatime,nodiratime,swalloc,logdev=/dev/mapper/vgdata01-lv02 /dev/mapper/vgdata01-lv01 /data01  

Install BenchmarkSQL

http://sourceforge.net/projects/benchmarksql/

Download and install JDK 7

http://www.oracle.com/technetwork/cn/java/javase/downloads/jdk7-downloads-1880260.html  
wget http://download.oracle.com/otn-pub/java/jdk/7u79-b15/jdk-7u79-linux-x64.rpm  
rpm -ivh jdk-7u79-linux-x64.rpm    

Check where the package was installed (the location can also be specified directly when installing with rpm)

rpm -ql jdk  
...  
/usr/java/jdk1.7.0_79/bin/java  
...  

Configure the Java environment variables

$  export JAVA_HOME=/usr/java/jdk1.7.0_79    
$  export PATH=$JAVA_HOME/bin:$PATH    
$  export CLASSPATH=.:$CLASSPATH    

Download the PostgreSQL JDBC jar matching the installed Java version

wget https://jdbc.postgresql.org/download/postgresql-9.4.1207.jre7.jar  
mv postgresql-9.4.1207.jre7.jar benchmarksql-4.1.0/lib/  

Configure BenchmarkSQL to use the new PostgreSQL JDBC driver

$ vi runBenchmark.sh   
java -cp .:../lib/postgresql-9.4.1207.jre7.jar:../lib/log4j-1.2.17.jar:../lib/apache-log4j-extras-1.1.jar:../dist/BenchmarkSQL-4.1.jar -Dprop=$1 jTPCC  

$ vi runLoader.sh  
java -cp .:../lib/postgresql-9.4.1207.jre7.jar:../dist/BenchmarkSQL-4.1.jar -Dprop=$1 LoadData $2 $3 $4 $5  

$ vi runSQL.sh   
myCP="../lib/postgresql-9.4.1207.jre7.jar"  
myCP="$myCP:../dist/BenchmarkSQL-4.1.jar"  

myOPTS="-Dprop=$1"  
myOPTS="$myOPTS -DcommandFile=$2"  

java -cp .:$myCP $myOPTS ExecJDBC  

Modify log4j to reduce logging volume: change priority to info so that only the final results are printed, not the logs for each generated order.

$ vi log4j.xml  
<?xml version="1.0" encoding="UTF-8" ?>  
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">  
<log4j:configuration xmlns:log4j='http://jakarta.apache.org/log4j/'>  

<appender name="console" class="org.apache.log4j.ConsoleAppender">  
<param name="Threshold" value="info"/>  
<layout class="org.apache.log4j.PatternLayout">  
<param name="ConversionPattern" value="%d %5p - %m%n"/>  
</layout>  
</appender>  

<appender name="R" class="org.apache.log4j.rolling.RollingFileAppender">  
<param name="Append" value="True" />  
<rollingPolicy class="org.apache.log4j.rolling.TimeBasedRollingPolicy">  
<param name="FileNamePattern" value="log/archive/benchmarksql.%d{yyyyMMddHHmm}.log"/>  
<param name="ActiveFileName" value="log/benchmarksql.log"/>  
</rollingPolicy>  
<triggeringPolicy class="org.apache.log4j.rolling.SizeBasedTriggeringPolicy">  
<param name="MaxFileSize" value="1"/>  
</triggeringPolicy>  
<layout class="org.apache.log4j.PatternLayout">  
<param name="ConversionPattern" value="%5p\t[%d{yyyy-MM-dd HH:mm:ss.SSS}]\t%t \t%m%n"/>  
</layout>  
<filter class="org.apache.log4j.filter.StringMatchFilter">  
<param name="StringToMatch" value ="\n" />  
<param name="AcceptOnMatch" value="false" />  
</filter>  
</appender>  

<appender name="E" class="org.apache.log4j.rolling.RollingFileAppender">  
<param name="Append" value="True" />  
<param name="Threshold" value="warn"/>  
<rollingPolicy class="org.apache.log4j.rolling.TimeBasedRollingPolicy">  
<param name="FileNamePattern" value="log/BenchmarkSQLError.%d.log"/>  
<param name="ActiveFileName" value="log/BenchmarkSQLError.log"/>  
</rollingPolicy>  
<layout class="org.apache.log4j.PatternLayout">  
<param name="ConversionPattern" value="%5p\t[%d{yyyy-MM-dd HH:mm:ss.SSS}]\t%t \t%m%n"/>  
</layout>  
</appender>  

<root>  
<priority value="info"/>  
<appender-ref ref="R"/>  
<appender-ref ref="E"/>  
</root>  

</log4j:configuration>  

System configuration tuning

Kernel configuration
/etc/grub.conf  
numa=off  
elevator=deadline  

Compiler version
gcc version 4.4.6 20110731 (Red Hat 4.4.6-3) (GCC)   

/etc/sysctl.conf  
vm.swappiness = 0  
kernel.shmmax=135497418752  
net.core.rmem_max = 4194304  
net.core.wmem_max = 4194304  
net.core.rmem_default = 262144  
net.core.wmem_default = 262144  
net.ipv4.ip_local_port_range = 9000 65535  
kernel.sem = 50100 64128000 50100 1280  
vm.dirty_background_bytes = 102400000  
vm.dirty_ratio = 80  
vm.nr_hugepages = 102352  

/etc/security/limits.conf  
* soft nofile 655360  
* hard nofile 655360  
* soft nproc 655360  
* hard nproc 655360  
* soft stack unlimited  
* hard stack unlimited  
* soft   memlock    250000000  
* hard   memlock    250000000  

Block device readahead
blockdev --setra 16384 /dev/dfa  
blockdev --setra 16384 /dev/dfb  
blockdev --setra 16384 /dev/dfc  
blockdev --setra 16384 /dev/dm-0  

Install PostgreSQL

PostgreSQL configure options

./configure --prefix=/u02/digoal/soft_bak/pgsql9.5 --with-blocksize=8 --with-pgport=1921 --with-perl --with-python --with-tcl --with-openssl --with-pam --with-ldap --with-libxml --with-libxslt --enable-thread-safety  
gmake world -j32  
gmake install-world -j32  

Configure the postgres environment variables

$ vi env_pg.sh   
export PS1="$USER@`/bin/hostname -s`-> "  
export PGPORT=1921  
export PGDATA=/data01/pgdata/pg_root  
export LANG=en_US.utf8  
export PGHOME=/u02/digoal/soft_bak/pgsql9.5  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
export PGHOST=$PGDATA  
export PGDATABASE=postgres  
export PGUSER=postgres  
alias rm='rm -i'  
alias ll='ls -lh'  
unalias vi  

Configure postgresql.conf

$ vi $PGDATA/postgresql.conf  
port = 1921     # (change requires restart)  
max_connections = 300       # (change requires restart)  
unix_socket_directories = '.'   # comma-separated list of directories  
shared_buffers = 32GB       # min 128kB  
huge_pages = try           # on, off, or try  
maintenance_work_mem = 2GB  # min 1MB  
dynamic_shared_memory_type = posix      # the default is the first option  
bgwriter_delay = 10ms       # 10-10000ms between rounds  
wal_level = minimal  # minimal, archive, hot_standby, or logical  
synchronous_commit = off    # synchronization level;  
full_page_writes = off      # recover from partial page writes; with backups and archiving in place it can be turned off, since after a crash you restore from the backup, avoiding partial writes
wal_buffers = 16MB           # min 32kB, -1 sets based on shared_buffers  
wal_writer_delay = 10ms         # 1-10000 milliseconds  
max_wal_size = 32GB  
effective_cache_size = 240GB  
log_destination = 'csvlog'  # Valid values are combinations of  
logging_collector = on          # Enable capturing of stderr and csvlog  
log_truncate_on_rotation = on           # If on, an existing log file with the  
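
As a cross-check (my addition, not part of the original procedure), pg_settings shows the values actually in effect after the restart:

select name, setting, unit, source
  from pg_settings
 where name in ('shared_buffers', 'huge_pages', 'synchronous_commit',
                'full_page_writes', 'wal_buffers', 'max_wal_size',
                'effective_cache_size');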

Edit the BenchmarkSQL connection and workload configuration

1000 warehouses, roughly 500 million rows.

$ vi props.pg   
driver=org.postgresql.Driver  
conn=jdbc:postgresql://localhost:1921/postgres  
user=postgres  
password=123  

warehouses=1000  
terminals=96  
//To run specified transactions per terminal- runMins must equal zero  
runTxnsPerTerminal=0  
//To run for specified minutes- runTxnsPerTerminal must equal zero  
runMins=1  
//Number of total transactions per minute  
limitTxnsPerMin=0  

//The following five values must add up to 100  
//The default percentages of 45, 43, 4, 4 & 4 match the TPC-C spec  
newOrderWeight=40  
paymentWeight=36  
orderStatusWeight=8  
deliveryWeight=8  
stockLevelWeight=8  

Generate the test data

Set the default search_path for the postgres role

$ psql  
psql (9.5.0)  
Type "help" for help.  
postgres=# alter role postgres set search_path='benchmarksql','public';  

Create a directory to hold the generated CSV files

$ mkdir /u02/digoal/soft_bak/benchcsv  

Edit BenchmarkSQL's sqlTableCopies to point at that directory

$ vi sqlTableCopies   

copy benchmarksql.warehouse  
  (w_id, w_ytd, w_tax, w_name, w_street_1, w_street_2, w_city, w_state, w_zip)    
  from '/u02/digoal/soft_bak/benchcsv/warehouse.csv' WITH CSV;  

copy benchmarksql.item  
  (i_id, i_name, i_price, i_data, i_im_id)   
  from '/u02/digoal/soft_bak/benchcsv/item.csv' WITH CSV;  

copy benchmarksql.stock  
  (s_i_id, s_w_id, s_quantity, s_ytd, s_order_cnt, s_remote_cnt, s_data,  
   s_dist_01, s_dist_02, s_dist_03, s_dist_04, s_dist_05,  
   s_dist_06, s_dist_07, s_dist_08, s_dist_09, s_dist_10)  
  from '/u02/digoal/soft_bak/benchcsv/stock.csv' WITH CSV;  

copy benchmarksql.district  
  (d_id, d_w_id, d_ytd, d_tax, d_next_o_id, d_name, d_street_1,  
   d_street_2, d_city, d_state, d_zip)   
  from '/u02/digoal/soft_bak/benchcsv/district.csv' WITH CSV;  

copy benchmarksql.customer  
  (c_id, c_d_id, c_w_id, c_discount, c_credit, c_last, c_first, c_credit_lim,   
   c_balance, c_ytd_payment, c_payment_cnt, c_delivery_cnt, c_street_1,   
   c_street_2, c_city, c_state, c_zip, c_phone, c_since, c_middle, c_data)   
  from '/u02/digoal/soft_bak/benchcsv/customer.csv' WITH CSV;  

copy benchmarksql.history  
  (hist_id, h_c_id, h_c_d_id, h_c_w_id, h_d_id, h_w_id, h_date, h_amount, h_data)   
  from '/u02/digoal/soft_bak/benchcsv/cust-hist.csv' WITH CSV;  

copy benchmarksql.oorder  
  (o_id, o_w_id, o_d_id, o_c_id, o_carrier_id, o_ol_cnt, o_all_local, o_entry_d)   
  from '/u02/digoal/soft_bak/benchcsv/order.csv' WITH CSV;  

copy benchmarksql.order_line  
  (ol_w_id, ol_d_id, ol_o_id, ol_number, ol_i_id, ol_delivery_d,   
   ol_amount, ol_supply_w_id, ol_quantity, ol_dist_info)   
  from '/u02/digoal/soft_bak/benchcsv/order-line.csv' WITH CSV;  

copy benchmarksql.new_order  
  (no_w_id, no_d_id, no_o_id)    
  from '/u02/digoal/soft_bak/benchcsv/new-order.csv' WITH CSV;  

Create the table structure

$ cd benchmarksql-4.1.0/run  
$ ./runSQL.sh props.pg sqlTableCreates  

Generate the CSV files

$ ./runLoader.sh props.pg numWarehouses 1000 fileLocation /u02/digoal/soft_bak/benchcsv/   

Data volume for 1000 warehouses:

total 69G  
-rw-r--r-- 1 digoal users 2.0G Jan  9 15:53 cust-hist.csv  
-rw-r--r-- 1 digoal users  16G Jan  9 15:53 customer.csv  
-rw-r--r-- 1 digoal users 898K Jan  9 15:12 district.csv  
-rw-r--r-- 1 digoal users 7.0M Jan  9 14:22 item.csv  
-rw-r--r-- 1 digoal users  95M Jan  9 16:14 new-order.csv  
-rw-r--r-- 1 digoal users 1.3G Jan  9 16:14 order.csv  
-rw-r--r-- 1 digoal users  22G Jan  9 16:14 order-line.csv  
-rw-r--r-- 1 digoal users  28G Jan  9 15:12 stock.csv  
-rw-r--r-- 1 digoal users  84K Jan  9 14:22 warehouse.csv  

Load the data into the database

$ ./runSQL.sh props.pg sqlTableCopies  

Create constraints and indexes

$ ./runSQL.sh props.pg sqlIndexCreates   

Backup

$ pg_dump -f /u02/digoal/soft_bak/benchmarksql.dmp -F c -n benchmarksql postgres  

Phase 1 TPC-C benchmark

nohup ./runBenchmark.sh props.pg >/dev/null 2>./errrun.log &  

Test results

 INFO   [2016-01-09 22:03:39.961]       Thread-7        Term-00,   
 INFO   [2016-01-09 22:03:39.963]       Thread-7        Term-00,   
 INFO   [2016-01-09 22:03:39.963]       Thread-7        Term-00, Measured tpmC (NewOrders) = 102494.46  
 INFO   [2016-01-09 22:03:39.963]       Thread-7        Term-00, Measured tpmTOTAL = 256195.32  
 INFO   [2016-01-09 22:03:39.964]       Thread-7        Term-00, Session Start     = 2016-01-09 21:53:39  
 INFO   [2016-01-09 22:03:39.964]       Thread-7        Term-00, Session End       = 2016-01-09 22:03:39  
 INFO   [2016-01-09 22:03:39.964]       Thread-7        Term-00, Transaction Count = 2563088  

Host statistics, captured at the 9th minute of the run.

TOP  
top - 22:02:09 up 3 days, 12:55,  3 users,  load average: 19.23, 15.97, 8.37  
Tasks: 619 total,  10 running, 609 sleeping,   0 stopped,   0 zombie  
Cpu(s): 35.0%us,  9.4%sy,  0.0%ni, 52.6%id,  0.1%wa,  0.0%hi,  2.9%si,  0.0%st  
Mem:  264643396k total, 241719372k used, 22924024k free,    36672k buffers  
Swap: 18825200k total,        0k used, 18825200k free, 196557376k cached  

iostat -x  
avg-cpu:  %user   %nice %system %iowait  %steal   %idle  
          35.07    0.00   12.30    0.12    0.00   52.51  
Device:         rrqm/s   wrqm/s     r/s     w/s   rsec/s   wsec/s avgrq-sz avgqu-sz   await  svctm  %util  
dfa   0.00     0.00   57.40  743.40   918.40 11849.00    15.94     0.02    0.03   0.03   2.08  
dfb   0.00     0.00   57.20  740.40   915.20 11829.00    15.98     0.02    0.03   0.03   2.04  
dfc   0.00     0.00   58.40  730.80   934.40 11675.80    15.98     0.03    0.03   0.03   2.52  
dm-0  0.00     0.00  173.00 2213.20  2768.00 35331.40    15.97     0.08    0.03   0.03   7.02  

PostgreSQL can be traced with oprofile or perf top to find hot spots.
Reference:
http://blog.163.com/digoal@126/blog/static/163877040201549115140794/
That is how you find the code worth optimizing.
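A minimal sampling sketch with perf, assuming perf is installed and run as root while the benchmark is running (oprofile's operf/opreport work similarly; an opreport invocation appears later in this article):

perf record -a -g -- sleep 60                 # sample all CPUs for 60 seconds during the test
perf report --stdio --sort symbol | head -40  # hottest symbols, comparable to the oprofile output below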

CPU: Intel Ivy Bridge microarchitecture, speed 2600 MHz (estimated)  
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (No unit mask) count 100000  
vma      samples  %        app name     symbol name  
007a7780 751274    5.1565  /soft/digoal/soft_bak/pgsql9.5/bin/postgres hash_search_with_hash_value  
004a92f0 574315    3.9419  /soft/digoal/soft_bak/pgsql9.5/bin/postgres _bt_compare  
006a4bd0 514473    3.5312  /soft/digoal/soft_bak/pgsql9.5/bin/postgres LWLockAcquire  
0078a090 510962    3.5071  /soft/digoal/soft_bak/pgsql9.5/bin/postgres SearchCatCache  
007bc3a0 484601    3.3262  /soft/digoal/soft_bak/pgsql9.5/bin/postgres AllocSetAlloc  
006969c0 442341    3.0361  /soft/digoal/soft_bak/pgsql9.5/bin/postgres GetSnapshotData  
00498930 352134    2.4170  /soft/digoal/soft_bak/pgsql9.5/bin/postgres heap_hot_search_buffer  
005b8f70 279718    1.9199  /soft/digoal/soft_bak/pgsql9.5/bin/postgres ExecInitExpr  
006895d0 249377    1.7117  /soft/digoal/soft_bak/pgsql9.5/bin/postgres PinBuffer  
006a4220 168770    1.1584  /soft/digoal/soft_bak/pgsql9.5/bin/postgres LWLockRelease  
007ac620 161861    1.1110  /soft/digoal/soft_bak/pgsql9.5/bin/postgres pg_encoding_mbcliplen  
007a2180 161090    1.1057  /soft/digoal/soft_bak/pgsql9.5/bin/postgres FunctionCall2Coll  
004aaa80 153079    1.0507  /soft/digoal/soft_bak/pgsql9.5/bin/postgres _bt_checkkeys  
007a3950 147078    1.0095  /soft/digoal/soft_bak/pgsql9.5/bin/postgres fmgr_info_cxt_security  
0049bce0 136680    0.9381  /soft/digoal/soft_bak/pgsql9.5/bin/postgres heap_page_prune_opt  
0048c8f0 130807    0.8978  /soft/digoal/soft_bak/pgsql9.5/bin/postgres hash_any  
006b2e50 130564    0.8962  /soft/digoal/soft_bak/pgsql9.5/bin/postgres PostgresMain  
0046c790 121776    0.8358  /soft/digoal/soft_bak/pgsql9.5/bin/postgres slot_deform_tuple  
......  

Phase 1 PostgreSQL 9.5.0 TPmC : 256195.32

Phase 1 bottleneck analysis

The system still has plenty of idle CPU and I/O capacity, so this is clearly not the ceiling. PostgreSQL should be able to reach roughly 500,000 tpm.

Phase 2 TPC-C tuning

benchmarksql is moved to another host; the two hosts are connected via 10GbE under the same switch.

To push past the limits of the test program, four schemas are used, each holding 1000 warehouses, about 2 billion rows and 400 GB of data in total.
Each benchmark instance drives one schema.
The terminal count stays the same: 24 terminals per benchmark instance, 96 in total.

Making benchmarksql support multiple schemas

The prebuilt benchmarksql jar and its configuration are hard-wired to the schema named benchmarksql. To stress one database through several schemas, we have to run several benchmarksql instances, which means benchmarksql must support multiple schemas, one schema per instance.
Directory layout:

drwxr-xr-x 2 digoal users 4096 Jan 10 13:24 build  
-rwxr-xr-x 1 digoal users 1112 Jan 10 13:24 build.xml  
drwxr-xr-x 2 digoal users 4096 Jan 10 13:24 dist  
-rw-r--r-- 1 digoal users  128 Jan 10 13:24 env_java.sh  
-rwxr-xr-x 1 digoal users 1927 Jan 10 13:24 HOW-TO-RUN.txt  
drwxr-xr-x 2 digoal users 4096 Jan 10 13:24 lib  
-rwxr-xr-x 1 digoal users 2825 Jan 10 13:24 README.txt  
drwxr-xr-x 3 digoal users 4096 Jan 10 13:24 run  
drwxr-xr-x 6 digoal users 4096 Jan 10 13:24 src  

Files that need to be changed:

src/LoadData/LoadData.java  
src/client/jTPCCTerminal.java  
run/props.ora  
run/props.pg  
run/sqlIndexCreates  
run/sqlIndexDrops  
run/sqlTableCopies  
run/sqlTableCreates  
run/sqlTableDrops  
run/sqlTableTruncates  

Replace every occurrence of benchmarksql with the new schema name, e.g. test01

sed -i "s/benchmarksql/test01/g" src/LoadData/LoadData.java  
sed -i "s/benchmarksql/test01/g" src/client/jTPCCTerminal.java  
sed -i "s/benchmarksql/test01/g" run/props.ora  
sed -i "s/benchmarksql/test01/g" run/props.pg  
sed -i "s/benchmarksql/test01/g" run/sqlIndexCreates  
sed -i "s/BENCHMARKSQL/TEST01/g" run/sqlIndexCreates  
sed -i "s/benchmarksql/test01/g" run/sqlIndexDrops  
sed -i "s/benchmarksql/test01/g" run/sqlTableCopies  
sed -i "s/benchmarksql/test01/g" run/sqlTableCreates  
sed -i "s/benchmarksql/test01/g" run/sqlTableDrops  
sed -i "s/benchmarksql/test01/g" run/sqlTableTruncates  

Then repackage the project with ant. If ant is not installed, yum install -y ant will install it.

Rebuild benchmarksql.jar with ant

$ant -buildfile ./build.xml   
Buildfile: ./build.xml  
Trying to override old definition of task javac  

init:  

compile:  
    [javac] Compiling 16 source files to /soft/digoal/soft_bak/benchmarksql-4.1.0_oracle01/build  
    [javac] Note: /soft/digoal/soft_bak/benchmarksql-4.1.0_oracle01/src/client/jTPCCTerminal.java uses unchecked or unsafe operations.  
    [javac] Note: Recompile with -Xlint:unchecked for details.  

dist:  
      [jar] Building jar: /soft/digoal/soft_bak/benchmarksql-4.1.0_oracle01/dist/BenchmarkSQL-4.1.jar  

BUILD SUCCESSFUL  
Total time: 2 seconds  

benchmarksql now uses the test01 schema.
Repeat the same steps to produce builds for the test02, test03 and test04 schemas (a scripted sketch follows).
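A rough way to script those steps; directory names are illustrative, and it assumes a pristine copy of the source named benchmarksql-4.1.0_orig plus bash 4 for the ${s^^} uppercase expansion:

for s in test02 test03 test04; do
  cp -r benchmarksql-4.1.0_orig benchmarksql-4.1.0_${s}
  cd benchmarksql-4.1.0_${s}
  # lower-case schema name in sources and run scripts
  sed -i "s/benchmarksql/${s}/g" src/LoadData/LoadData.java src/client/jTPCCTerminal.java \
      run/props.ora run/props.pg run/sqlIndexCreates run/sqlIndexDrops \
      run/sqlTableCopies run/sqlTableCreates run/sqlTableDrops run/sqlTableTruncates
  # upper-case occurrences only appear in sqlIndexCreates
  sed -i "s/BENCHMARKSQL/${s^^}/g" run/sqlIndexCreates
  ant -buildfile ./build.xml
  cd ..
done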

Create four databases: test01, test02, test03 and test04.
Restore the phase-1 pg_dump into each of them and rename the schema to the matching test01 ... test04 (a sketch follows).
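A minimal sketch of that restore, assuming the phase-1 dump at /u02/digoal/soft_bak/benchmarksql.dmp; the role names and the password '123' simply mirror props.pg and are illustrative:

for s in test01 test02 test03 test04; do
  psql -c "create role ${s} login password '123';"
  createdb -O ${s} ${s}
  # restore the benchmarksql schema, owned by the target role
  pg_restore -O --role=${s} -d ${s} /u02/digoal/soft_bak/benchmarksql.dmp
  psql -d ${s} -c "alter schema benchmarksql rename to ${s};"
done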
Test data volume

postgres=# \l+  
   List of databases  
   Name    |  Owner   | Encoding | Collate | Ctype |   Access privileges   |  Size   | Tablespace |    Description       
-----------+----------+----------+---------+-------+-----------------------+---------+------------+--------------------------------------------  
 test01    | test01   | UTF8     | C       | C     |           | 100 GB  | pg_default |   
 test02    | test02   | UTF8     | C       | C     |           | 100 GB  | pg_default |   
 test03    | test03   | UTF8     | C       | C     |           | 100 GB  | pg_default |   
 test04    | test04   | UTF8     | C       | C     |           | 100 GB  | pg_default |   

benchmarksql software directories

$ ll  
drwxr-xr-x 7 digoal users 4.0K Jan 10 14:41 benchmarksql-4.1.0_pg01  
drwxr-xr-x 7 digoal users 4.0K Jan 10 14:41 benchmarksql-4.1.0_pg02  
drwxr-xr-x 7 digoal users 4.0K Jan 10 14:41 benchmarksql-4.1.0_pg03  
drwxr-xr-x 7 digoal users 4.0K Jan 10 14:41 benchmarksql-4.1.0_pg04  

Configure props.pg for each benchmarksql instance with its own connection settings.
For example:

$cat run/props.pg  
driver=org.postgresql.Driver  
conn=jdbc:postgresql://xxx.xxx.xxx.xxx:1921/test01?preparedStatementCacheSizeMiB=10  
user=test01  
password=123  

warehouses=1000  
terminals=20  
//To run specified transactions per terminal- runMins must equal zero  
runTxnsPerTerminal=0  
//To run for specified minutes- runTxnsPerTerminal must equal zero  
runMins=10  
//Number of total transactions per minute  
limitTxnsPerMin=0  

//The following five values must add up to 100  
//The default percentages of 45, 43, 4, 4 & 4 match the TPC-C spec  
newOrderWeight=40  
paymentWeight=36  
orderStatusWeight=8  
deliveryWeight=8  
stockLevelWeight=8  

Configure pg_hba.conf so that the benchmark host is allowed to connect.

vi $PGDATA/pg_hba.conf  
host all all 0.0.0.0/0 md5  
pg_ctl reload  
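A quick connectivity check from the benchmark host before starting the runs (the address is a placeholder for the database server's IP):

psql "host=192.0.2.10 port=1921 dbname=test01 user=test01" -c "select 1"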

Phase 2 TPC-C benchmark

cd benchmarksql-4.1.0_pg01/run  
nohup ./runBenchmark.sh props.pg >/dev/null 2>./errrun.log &  
cd ../../benchmarksql-4.1.0_pg02/run  
nohup ./runBenchmark.sh props.pg >/dev/null 2>./errrun.log &  
cd ../../benchmarksql-4.1.0_pg03/run  
nohup ./runBenchmark.sh props.pg >/dev/null 2>./errrun.log &  
cd ../../benchmarksql-4.1.0_pg04/run  
nohup ./runBenchmark.sh props.pg >/dev/null 2>./errrun.log &  
cd ../..  

Phase 2 PostgreSQL 9.5.0 TPmC : 453058.64

$ cat benchmarksql-4.1.0_pg01/run/log/benchmarksql.log   
 INFO   [2016-01-10 17:54:04.925]       Thread-22       Term-00, Measured tpmC (NewOrders) = 45416.28  
 INFO   [2016-01-10 17:54:04.925]       Thread-22       Term-00, Measured tpmTOTAL = 113487.61  
 INFO   [2016-01-10 17:54:04.925]       Thread-22       Term-00, Session Start     = 2016-01-10 17:44:04  
 INFO   [2016-01-10 17:54:04.925]       Thread-22       Term-00, Session End       = 2016-01-10 17:54:04  
 INFO   [2016-01-10 17:54:04.925]       Thread-22       Term-00, Transaction Count = 1134913  
$ cat benchmarksql-4.1.0_pg02/run/log/benchmarksql.log   
 INFO   [2016-01-10 17:54:04.943]       Thread-12       Term-00, Measured tpmC (NewOrders) = 45292.48  
 INFO   [2016-01-10 17:54:04.943]       Thread-12       Term-00, Measured tpmTOTAL = 113269.54  
 INFO   [2016-01-10 17:54:04.943]       Thread-12       Term-00, Session Start     = 2016-01-10 17:44:04  
 INFO   [2016-01-10 17:54:04.944]       Thread-12       Term-00, Session End       = 2016-01-10 17:54:04  
 INFO   [2016-01-10 17:54:04.944]       Thread-12       Term-00, Transaction Count = 1132770  
$ cat benchmarksql-4.1.0_pg03/run/log/benchmarksql.log   
 INFO   [2016-01-10 17:54:04.955]       Thread-12       Term-00, Measured tpmC (NewOrders) = 45336.15  
 INFO   [2016-01-10 17:54:04.955]       Thread-12       Term-00, Measured tpmTOTAL = 113247.19  
 INFO   [2016-01-10 17:54:04.956]       Thread-12       Term-00, Session Start     = 2016-01-10 17:44:04  
 INFO   [2016-01-10 17:54:04.956]       Thread-12       Term-00, Session End       = 2016-01-10 17:54:04  
 INFO   [2016-01-10 17:54:04.956]       Thread-12       Term-00, Transaction Count = 1132537  
$ cat benchmarksql-4.1.0_pg04/run/log/benchmarksql.log   
 INFO   [2016-01-10 17:54:04.986]       Thread-23       Term-00, Measured tpmC (NewOrders) = 45231.67  
 INFO   [2016-01-10 17:54:04.987]       Thread-23       Term-00, Measured tpmTOTAL = 113054.3  
 INFO   [2016-01-10 17:54:04.987]       Thread-23       Term-00, Session Start     = 2016-01-10 17:44:04  
 INFO   [2016-01-10 17:54:04.987]       Thread-23       Term-00, Session End       = 2016-01-10 17:54:04  
 INFO   [2016-01-10 17:54:04.987]       Thread-23       Term-00, Transaction Count = 1130640  

TPM :   
113487.61 + 113269.54 + 113247.19 + 113054.3 =  453058.64  

OS statistics at the 9th minute of the run

TOP  
top - 17:38:27 up 4 days,  8:32,  4 users,  load average: 78.54, 68.64, 37.22  
Tasks: 658 total,  34 running, 624 sleeping,   0 stopped,   0 zombie  
Cpu(s): 70.2%us, 15.7%sy,  0.0%ni,  5.5%id,  1.5%wa,  0.0%hi,  7.1%si,  0.0%st  
Mem:  264643396k total, 229866068k used, 34777328k free,    59652k buffers  
Swap: 18825200k total,        0k used, 18825200k free, 183529592k cached  

iostat -x  
avg-cpu:  %user   %nice %system %iowait  %steal   %idle  
          71.39    0.00   22.47    1.26    0.00    4.88  
Device:         rrqm/s   wrqm/s     r/s     w/s   rsec/s   wsec/s avgrq-sz avgqu-sz   await  svctm  %util  
dfa   0.00     0.00 3659.33 7008.67 58538.67 112050.67    15.99     5.85    0.55   0.06  68.17  
dfb   0.00     0.00 3714.67 6888.67 59418.67 110173.33    15.99     5.98    0.56   0.06  67.87  
dfc   0.00     0.00 3709.00 6974.33 59328.00 111504.00    15.99     5.63    0.52   0.07  71.60  
dm-0  0.00     0.00 11083.00 20870.33 177285.33 333706.67    15.99    17.60    0.55   0.03  92.10  

oprofile report taken during the run

#/home/digoal/oprof/bin/opreport -l -f -w -x -t 0.5  
Using /soft/digoal/soft_bak/oprof_test/oprofile_data/samples/ for samples directory.  

WARNING! Some of the events were throttled. Throttling occurs when  
the initial sample rate is too high, causing an excessive number of  
interrupts.  Decrease the sampling frequency. Check the directory  
/soft/digoal/soft_bak/oprof_test/oprofile_data/samples/current/stats/throttled  
for the throttled event names.  

CPU: Intel Ivy Bridge microarchitecture, speed 2600 MHz (estimated)  
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (No unit mask) count 100000  
vma      samples  %        app name     symbol name  
007a7780 2632700   5.2511  /soft/digoal/soft_bak/pgsql9.5/bin/postgres hash_search_with_hash_value  
004a92f0 1895924   3.7816  /soft/digoal/soft_bak/pgsql9.5/bin/postgres _bt_compare  
006969c0 1844371   3.6787  /soft/digoal/soft_bak/pgsql9.5/bin/postgres GetSnapshotData  
0078a090 1775031   3.5404  /soft/digoal/soft_bak/pgsql9.5/bin/postgres SearchCatCache  
006a4bd0 1725350   3.4413  /soft/digoal/soft_bak/pgsql9.5/bin/postgres LWLockAcquire  
007bc3a0 1565190   3.1219  /soft/digoal/soft_bak/pgsql9.5/bin/postgres AllocSetAlloc  
00498930 1406694   2.8058  /soft/digoal/soft_bak/pgsql9.5/bin/postgres heap_hot_search_buffer  
005b8f70 965646    1.9261  /soft/digoal/soft_bak/pgsql9.5/bin/postgres ExecInitExpr  
006895d0 767078    1.5300  /soft/digoal/soft_bak/pgsql9.5/bin/postgres PinBuffer  
004aaa80 617741    1.2321  /soft/digoal/soft_bak/pgsql9.5/bin/postgres _bt_checkkeys  
007a2180 588043    1.1729  /soft/digoal/soft_bak/pgsql9.5/bin/postgres FunctionCall2Coll  
006a4220 575864    1.1486  /soft/digoal/soft_bak/pgsql9.5/bin/postgres LWLockRelease  
007ac620 485162    0.9677  /soft/digoal/soft_bak/pgsql9.5/bin/postgres pg_encoding_mbcliplen  
007a3950 471102    0.9396  /soft/digoal/soft_bak/pgsql9.5/bin/postgres fmgr_info_cxt_security  
0046c790 441548    0.8807  /soft/digoal/soft_bak/pgsql9.5/bin/postgres slot_deform_tuple  
0048c8f0 425867    0.8494  /soft/digoal/soft_bak/pgsql9.5/bin/postgres hash_any  
006b2e50 404548    0.8069  /soft/digoal/soft_bak/pgsql9.5/bin/postgres PostgresMain  
007bd0f0 396510    0.7909  /soft/digoal/soft_bak/pgsql9.5/bin/postgres palloc  
0049bce0 394201    0.7863  /soft/digoal/soft_bak/pgsql9.5/bin/postgres heap_page_prune_opt  
007bce00 353243    0.7046  /soft/digoal/soft_bak/pgsql9.5/bin/postgres pfree  
0049b300 335896    0.6700  /soft/digoal/soft_bak/pgsql9.5/bin/postgres heap_page_prune  
0046c580 313145    0.6246  /soft/digoal/soft_bak/pgsql9.5/bin/postgres heap_getsysattr  
006b14a0 311776    0.6219  /soft/digoal/soft_bak/pgsql9.5/bin/postgres exec_bind_message  
007cb070 292106    0.5826  /soft/digoal/soft_bak/pgsql9.5/bin/postgres HeapTupleSatisfiesMVCC  
007bd210 275282    0.5491  /soft/digoal/soft_bak/pgsql9.5/bin/postgres MemoryContextAllocZeroAligned  
005b8530 273199    0.5449  /soft/digoal/soft_bak/pgsql9.5/bin/postgres ExecProject  
00494ba0 266495    0.5315  /soft/digoal/soft_bak/pgsql9.5/bin/postgres heap_update  
007bca10 265556    0.5297  /soft/digoal/soft_bak/pgsql9.5/bin/postgres AllocSetFree  

Phase 2 bottleneck analysis

  1. Single I/O request latency is relatively high, around 0.06 ms
  2. The CPU share spent in system calls is high
  3. Snapshot acquisition in the database costs a lot of CPU and needs code-level optimization

Phase 3 TPC-C tuning

  1. Enable PostgreSQL prefetching; set it to (n-1), where n is the number of stripes, hence effective_io_concurrency = 2 in this case
    (the xfs largeio option and block-device read-ahead have a similar effect)
    (prefetching can waste I/O when everything is already in the buffer cache; it is very effective for OLAP)
  2. Enable huge pages, around 168 GB here (a sizing sketch follows this list);
/etc/sysctl.conf
  vm.nr_hugepages = 102352
sysctl -p
/etc/security/limits.conf
  * soft   memlock    250000000
  * hard   memlock    250000000
  # memlock must be larger than the huge page allocation, which must be larger than shared_buffers
  3. Use group commit: commit_delay = 10, commit_siblings = 16
    Smooth checkpoints over 0.8 of the checkpoint interval to reduce the I/O impact of fsyncing dirty pages.
http://blog.163.com/digoal@126/blog/static/1638770402016011115141697/  
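A minimal sizing sketch for item 2, assuming the usual 2 MB huge page size on x86_64 (check /proc/meminfo on your own host):

grep Hugepagesize /proc/meminfo   # typically 2048 kB
# pages needed just to cover shared_buffers = 164 GB:
#   164 * 1024 * 1024 kB / 2048 kB = 83968 pages
# the 102352 pages configured above correspond to 102352 * 2 MB, roughly 200 GB of huge pages,
# i.e. comfortable headroom above shared_buffers; the memlock limit of 250000000 kB (~238 GB) covers it.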
shared_buffers = 164GB       # min 128kB  
huge_pages = on           # on, off, or try  
maintenance_work_mem = 2GB  # min 1MB  
wal_buffers = 16MB           # min 32kB, -1 sets based on shared_buffers  
wal_writer_delay = 10ms         # 1-10000 milliseconds  
commit_delay = 10           # range 0-100000, in microseconds  
commit_siblings = 16        # range 1-1000  
checkpoint_timeout = 35min  # range 30s-1h  
max_wal_size = 320GB  
checkpoint_completion_target = 0.8     # checkpoint target duration, 0.0 - 1.0  
effective_cache_size = 240GB  
log_destination = 'csvlog'  # Valid values are combinations of  
logging_collector = on          # Enable capturing of stderr and csvlog  
log_truncate_on_rotation = on           # If on, an existing log file with the  

Phase 3 TPC-C benchmark

$tail -n 5 benchmarksql-4.1.0_pg01/run/log/benchmarksql.log   
 INFO   [2016-01-11 13:33:55.917]       Thread-14       Term-00, Measured tpmC (NewOrders) = 48151.07  
 INFO   [2016-01-11 13:33:55.917]       Thread-14       Term-00, Measured tpmTOTAL = 120215.48  
 INFO   [2016-01-11 13:33:55.917]       Thread-14       Term-00, Session Start     = 2016-01-11 13:23:55  
 INFO   [2016-01-11 13:33:55.917]       Thread-14       Term-00, Session End       = 2016-01-11 13:33:55  
 INFO   [2016-01-11 13:33:55.917]       Thread-14       Term-00, Transaction Count = 1202222  

$tail -n 5 benchmarksql-4.1.0_pg02/run/log/benchmarksql.log   
 INFO   [2016-01-11 13:33:55.971]       Thread-16       Term-00, Measured tpmC (NewOrders) = 48505.54  
 INFO   [2016-01-11 13:33:55.971]       Thread-16       Term-00, Measured tpmTOTAL = 121182.26  
 INFO   [2016-01-11 13:33:55.971]       Thread-16       Term-00, Session Start     = 2016-01-11 13:23:55  
 INFO   [2016-01-11 13:33:55.972]       Thread-16       Term-00, Session End       = 2016-01-11 13:33:55  
 INFO   [2016-01-11 13:33:55.972]       Thread-16       Term-00, Transaction Count = 1211858  

$tail -n 5 benchmarksql-4.1.0_pg03/run/log/benchmarksql.log   
 INFO   [2016-01-11 13:33:55.985]       Thread-4        Term-00, Measured tpmC (NewOrders) = 48119.61  
 INFO   [2016-01-11 13:33:55.985]       Thread-4        Term-00, Measured tpmTOTAL = 120523.98  
 INFO   [2016-01-11 13:33:55.985]       Thread-4        Term-00, Session Start     = 2016-01-11 13:23:55  
 INFO   [2016-01-11 13:33:55.985]       Thread-4        Term-00, Session End       = 2016-01-11 13:33:55  
 INFO   [2016-01-11 13:33:55.985]       Thread-4        Term-00, Transaction Count = 1205271  

$tail -n 5 benchmarksql-4.1.0_pg04/run/log/benchmarksql.log   
 INFO   [2016-01-11 13:33:55.958]       Thread-21       Term-00, Measured tpmC (NewOrders) = 48087.55  
 INFO   [2016-01-11 13:33:55.958]       Thread-21       Term-00, Measured tpmTOTAL = 120461.29  
 INFO   [2016-01-11 13:33:55.958]       Thread-21       Term-00, Session Start     = 2016-01-11 13:23:55  
 INFO   [2016-01-11 13:33:55.958]       Thread-21       Term-00, Session End       = 2016-01-11 13:33:55  
 INFO   [2016-01-11 13:33:55.958]       Thread-21       Term-00, Transaction Count = 1204638  
TPM:  
120215.48 + 121182.26 + 120523.98 + 120461.29 = 482383.01  

Phase 3 PostgreSQL 9.5.0 TPmC : 482383.01

Phase 3 bottleneck analysis

  1. The OS background flush of dirty pages moves too much data at once and causes jitter
  2. Tune the concurrency level to reduce the CPU cost of transaction snapshots
  3. Tune work_mem to avoid on-disk sorts
  4. Tune the group-commit thresholds

Phase 4 TPC-C tuning

Tune the group-commit delay, the minimum number of concurrent transactions required for group commit, work_mem, and so on.
Kernel parameter tuning:
Background flusher wakeup interval for aged dirty pages
vm.dirty_writeback_centisecs=10
Age threshold for dirty pages
vm.dirty_expire_centisecs=6000
Threshold at which user processes must flush dirty pages themselves
vm.dirty_ratio=80
Threshold at which the kernel background flusher starts writing dirty pages
vm.dirty_background_bytes=102400000
Reduce the terminal count: 20 terminals per benchmarksql instance, 80 in total. (A sysctl sketch for the values above follows.)
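The kernel values above can be applied like this (run as root; note that setting dirty_background_bytes overrides dirty_background_ratio):

cat >> /etc/sysctl.conf <<EOF
vm.dirty_writeback_centisecs = 10
vm.dirty_expire_centisecs = 6000
vm.dirty_ratio = 80
vm.dirty_background_bytes = 102400000
EOF
sysctl -p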

Parameters

listen_addresses = '0.0.0.0'         # what IP address(es) to listen on;  
port = 1921     # (change requires restart)  
max_connections = 300       # (change requires restart)  
unix_socket_directories = '.'   # comma-separated list of directories  
shared_buffers = 164GB       # min 128kB  
huge_pages = on           # on, off, or try  
work_mem = 256MB # min 64kB  
maintenance_work_mem = 2GB  # min 1MB  
autovacuum_work_mem = 2GB   # min 1MB, or -1 to use maintenance_work_mem  
dynamic_shared_memory_type = mmap      # the default is the first option  
vacuum_cost_delay = 10      # 0-100 milliseconds  
vacuum_cost_limit = 10000    # 1-10000 credits  
bgwriter_delay = 10ms       # 10-10000ms between rounds  
bgwriter_lru_maxpages = 1000# 0-1000 max buffers written/round  
bgwriter_lru_multiplier = 10.0          # 0-10.0 multipler on buffers scanned/round  
effective_io_concurrency = 2           # 1-1000; 0 disables prefetching  
wal_level = minimal  # minimal, archive, hot_standby, or logical  
synchronous_commit = off    # synchronization level;  
full_page_writes = off      # recover from partial page writes  
wal_buffers = 1GB           # min 32kB, -1 sets based on shared_buffers  
wal_writer_delay = 10ms         # 1-10000 milliseconds  
commit_delay = 10           # range 0-100000, in microseconds  
commit_siblings = 6        # range 1-1000  
checkpoint_timeout = 55min  # range 30s-1h  
max_wal_size = 320GB  
checkpoint_completion_target = 0.99     # checkpoint target duration, 0.0 - 1.0  
random_page_cost = 1.0     # same scale as above  
effective_cache_size = 240GB  
log_destination = 'csvlog'  # Valid values are combinations of  
logging_collector = on          # Enable capturing of stderr and csvlog  
log_truncate_on_rotation = on           # If on, an existing log file with the  
log_timezone = 'PRC'  
update_process_title = off  
track_activities = off  
autovacuum = on# Enable autovacuum subprocess?  'on'  

Restart the database

pg_ctl restart  

Load the data into shared buffers

psql  
\c test01 test01  
explain analyze select * from customer; explain analyze select * from stock;  
\c test02 test02  
explain analyze select * from customer; explain analyze select * from stock;  
\c test03 test03  
explain analyze select * from customer; explain analyze select * from stock;  
\c test04 test04  
explain analyze select * from customer; explain analyze select * from stock;  
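If the pg_prewarm extension that ships with 9.5 is available, it does the same warm-up more directly; a sketch (run CREATE EXTENSION as a superuser):

psql -d test01 -c "create extension if not exists pg_prewarm;"
psql -d test01 -U test01 -c "select pg_prewarm('test01.customer'); select pg_prewarm('test01.stock');"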

Phase 4 TPC-C benchmark

$ tail -n 5 benchmarksql-4.1.0_pg01/run/log/benchmarksql.log   
 INFO   [2016-01-12 11:55:09.461]       Thread-12       Term-00, Measured tpmC (NewOrders) = 57995.55  
 INFO   [2016-01-12 11:55:09.461]       Thread-12       Term-00, Measured tpmTOTAL = 144975.59  
 INFO   [2016-01-12 11:55:09.461]       Thread-12       Term-00, Session Start     = 2016-01-12 11:45:09  
 INFO   [2016-01-12 11:55:09.461]       Thread-12       Term-00, Session End       = 2016-01-12 11:55:09  
 INFO   [2016-01-12 11:55:09.462]       Thread-12       Term-00, Transaction Count = 1449796  
$ tail -n 5 benchmarksql-4.1.0_pg02/run/log/benchmarksql.log   
 INFO   [2016-01-12 11:55:09.499]       Thread-0        Term-00, Measured tpmC (NewOrders) = 58013.75  
 INFO   [2016-01-12 11:55:09.499]       Thread-0        Term-00, Measured tpmTOTAL = 145006.74  
 INFO   [2016-01-12 11:55:09.499]       Thread-0        Term-00, Session Start     = 2016-01-12 11:45:09  
 INFO   [2016-01-12 11:55:09.500]       Thread-0        Term-00, Session End       = 2016-01-12 11:55:09  
 INFO   [2016-01-12 11:55:09.500]       Thread-0        Term-00, Transaction Count = 1450110  
$ tail -n 5 benchmarksql-4.1.0_pg03/run/log/benchmarksql.log   
 INFO   [2016-01-12 11:55:09.541]       Thread-14       Term-00, Measured tpmC (NewOrders) = 57322.05  
 INFO   [2016-01-12 11:55:09.541]       Thread-14       Term-00, Measured tpmTOTAL = 143227.03  
 INFO   [2016-01-12 11:55:09.542]       Thread-14       Term-00, Session Start     = 2016-01-12 11:45:09  
 INFO   [2016-01-12 11:55:09.542]       Thread-14       Term-00, Session End       = 2016-01-12 11:55:09  
 INFO   [2016-01-12 11:55:09.542]       Thread-14       Term-00, Transaction Count = 1432298  
$ tail -n 5 benchmarksql-4.1.0_pg04/run/log/benchmarksql.log   
 INFO   [2016-01-12 11:55:09.574]       Thread-7        Term-00, Measured tpmC (NewOrders) = 57863.92  
 INFO   [2016-01-12 11:55:09.574]       Thread-7        Term-00, Measured tpmTOTAL = 144596.45  
 INFO   [2016-01-12 11:55:09.575]       Thread-7        Term-00, Session Start     = 2016-01-12 11:45:09  
 INFO   [2016-01-12 11:55:09.575]       Thread-7        Term-00, Session End       = 2016-01-12 11:55:09  
 INFO   [2016-01-12 11:55:09.575]       Thread-7        Term-00, Transaction Count = 1445978  
TPM:  
144975.59 + 145006.74 + 143227.03 + 144596.45 = 577805.81  

Phase 4 PostgreSQL 9.5.0 TPmC : 577805.81

Phase 4 bottleneck analysis

No obvious bottleneck remains; further gains have to come from the compiler and from code-level changes.

Phase 5 TPC-C tuning

Newer gcc compiler versions:
http://blog.163.com/digoal@126/blog/static/163877040201601313814429/
The Intel compiler:
https://software.intel.com/en-us/intel-compilers
The Clang compiler:
http://blog.163.com/digoal@126/blog/static/163877040201601382640309/
Rebuild with gcc 4.9.3 and an updated CFLAGS:

$ export LD_LIBRARY_PATH=/u02/digoal/gcc4.9.3/lib:/u02/digoal/cloog/lib:/u02/digoal/gmp/lib:/u02/digoal/isl/lib:/u02/digoal/mpc/lib:/u02/digoal/mpfr/lib:$LD_LIBRARY_PATH  
$ export PATH=/u02/digoal/gcc4.9.3/bin:$PATH  

$ CFLAGS="-O3 -march=native -flto" CC=/u02/digoal/gcc4.9.3/bin/gcc ./configure --prefix=/u02/digoal/soft_bak/pgsql9.5 --with-blocksize=8 --with-pgport=1921 --with-perl --with-python --with-tcl --with-openssl --with-pam --with-ldap --with-libxml --with-libxslt --enable-thread-safety --with-wal-segsize=64  

$ make world -j 32  
$ make install-world -j 32  

Phase 5 TPC-C benchmark

digoal tail -n 5 benchmarksql-4.1.0_pg01/run/log/benchmarksql.log   
 INFO   [2016-01-13 02:00:49.699]       Thread-15       Term-00, Measured tpmC (NewOrders) = 59092.33  
 INFO   [2016-01-13 02:00:49.699]       Thread-15       Term-00, Measured tpmTOTAL = 147832.44  
 INFO   [2016-01-13 02:00:49.699]       Thread-15       Term-00, Session Start     = 2016-01-13 01:50:49  
 INFO   [2016-01-13 02:00:49.699]       Thread-15       Term-00, Session End       = 2016-01-13 02:00:49  
 INFO   [2016-01-13 02:00:49.699]       Thread-15       Term-00, Transaction Count = 1478385  
digoal tail -n 5 benchmarksql-4.1.0_pg02/run/log/benchmarksql.log   
 INFO   [2016-01-13 02:00:49.704]       Thread-0        Term-00, Measured tpmC (NewOrders) = 60051.49  
 INFO   [2016-01-13 02:00:49.704]       Thread-0        Term-00, Measured tpmTOTAL = 150231.54  
 INFO   [2016-01-13 02:00:49.704]       Thread-0        Term-00, Session Start     = 2016-01-13 01:50:49  
 INFO   [2016-01-13 02:00:49.704]       Thread-0        Term-00, Session End       = 2016-01-13 02:00:49  
 INFO   [2016-01-13 02:00:49.704]       Thread-0        Term-00, Transaction Count = 1502367  
digoal tail -n 5 benchmarksql-4.1.0_pg03/run/log/benchmarksql.log   
 INFO   [2016-01-13 02:00:49.693]       Thread-16       Term-00, Measured tpmC (NewOrders) = 60273.99  
 INFO   [2016-01-13 02:00:49.694]       Thread-16       Term-00, Measured tpmTOTAL = 150601.93  
 INFO   [2016-01-13 02:00:49.694]       Thread-16       Term-00, Session Start     = 2016-01-13 01:50:49  
 INFO   [2016-01-13 02:00:49.694]       Thread-16       Term-00, Session End       = 2016-01-13 02:00:49  
 INFO   [2016-01-13 02:00:49.694]       Thread-16       Term-00, Transaction Count = 1506066  
digoal tail -n 5 benchmarksql-4.1.0_pg04/run/log/benchmarksql.log   
 INFO   [2016-01-13 02:00:49.715]       Thread-18       Term-00, Measured tpmC (NewOrders) = 60180.69  
 INFO   [2016-01-13 02:00:49.715]       Thread-18       Term-00, Measured tpmTOTAL = 150591.78  
 INFO   [2016-01-13 02:00:49.716]       Thread-18       Term-00, Session Start     = 2016-01-13 01:50:49  
 INFO   [2016-01-13 02:00:49.716]       Thread-18       Term-00, Session End       = 2016-01-13 02:00:49  
 INFO   [2016-01-13 02:00:49.716]       Thread-18       Term-00, Transaction Count = 1505962  

TPM  
599257.69  

Phase 5 PostgreSQL 9.5.0 TPmC : 599257.69

Phase 5 bottleneck analysis

Switch to the Clang compiler.

Phase 6 TPC-C tuning

Building with Clang:
http://blog.163.com/digoal@126/blog/static/163877040201601421045406/
Compile with clang:

CC=/u02/digoal/llvm/bin/clang CFLAGS="-O2 -fstrict-enums" ./configure --prefix=/u02/digoal/soft_bak/pgsql9.5  --with-pgport=1921 --with-perl --with-python --with-tcl --with-openssl --with-pam --with-ldap --with-libxml --with-libxslt --enable-thread-safety  
make world -j 32  
make install-world -j 32  

Phase 6 TPC-C benchmark

$ tail -n 5 benchmarksql-4.1.0_pg01/run/log/benchmarksql.log   
 INFO   [2016-01-16 07:21:58.070]       Thread-12       Term-00, Measured tpmC (NewOrders) = 60519.19  
 INFO   [2016-01-16 07:21:58.070]       Thread-12       Term-00, Measured tpmTOTAL = 151235.02  
 INFO   [2016-01-16 07:21:58.070]       Thread-12       Term-00, Session Start     = 2016-01-16 07:11:58  
 INFO   [2016-01-16 07:21:58.071]       Thread-12       Term-00, Session End       = 2016-01-16 07:21:58  
 INFO   [2016-01-16 07:21:58.071]       Thread-12       Term-00, Transaction Count = 1512377  
$ tail -n 5 benchmarksql-4.1.0_pg02/run/log/benchmarksql.log   
 INFO   [2016-01-16 07:21:58.180]       Thread-15       Term-00, Measured tpmC (NewOrders) = 60924.87  
 INFO   [2016-01-16 07:21:58.180]       Thread-15       Term-00, Measured tpmTOTAL = 152126.73  
 INFO   [2016-01-16 07:21:58.180]       Thread-15       Term-00, Session Start     = 2016-01-16 07:11:58  
 INFO   [2016-01-16 07:21:58.180]       Thread-15       Term-00, Session End       = 2016-01-16 07:21:58  
 INFO   [2016-01-16 07:21:58.180]       Thread-15       Term-00, Transaction Count = 1521312  
$ tail -n 5 benchmarksql-4.1.0_pg03/run/log/benchmarksql.log   
 INFO   [2016-01-16 07:21:58.198]       Thread-0        Term-00, Measured tpmC (NewOrders) = 60481.19  
 INFO   [2016-01-16 07:21:58.198]       Thread-0        Term-00, Measured tpmTOTAL = 151294.63  
 INFO   [2016-01-16 07:21:58.199]       Thread-0        Term-00, Session Start     = 2016-01-16 07:11:58  
 INFO   [2016-01-16 07:21:58.199]       Thread-0        Term-00, Session End       = 2016-01-16 07:21:58  
 INFO   [2016-01-16 07:21:58.199]       Thread-0        Term-00, Transaction Count = 1512968  
$ tail -n 5 benchmarksql-4.1.0_pg04/run/log/benchmarksql.log   
 INFO   [2016-01-16 07:21:58.200]       Thread-5        Term-00, Measured tpmC (NewOrders) = 60715.57  
 INFO   [2016-01-16 07:21:58.200]       Thread-5        Term-00, Measured tpmTOTAL = 151809.93  
 INFO   [2016-01-16 07:21:58.200]       Thread-5        Term-00, Session Start     = 2016-01-16 07:11:58  
 INFO   [2016-01-16 07:21:58.200]       Thread-5        Term-00, Session End       = 2016-01-16 07:21:58  
 INFO   [2016-01-16 07:21:58.200]       Thread-5        Term-00, Transaction Count = 1518149  
TPM:  
606466.31  

Phase 6 PostgreSQL 9.5.0 TPmC : 606466.31

Current perf top

 samples  pcnt function  DSO  
 _______ _____ _________________________________ __________________________________________  

15900.00  3.2% hash_search_with_hash_value       /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
13970.00  2.8% _bt_compare           /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
13215.00  2.6% AllocSetAlloc         /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
10678.00  2.1% LWLockAcquire         /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
10298.00  2.1% memcpy    /lib64/libc-2.12.so             
 9016.00  1.8% SearchCatCache        /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 8577.00  1.7% heap_hot_search_buffer/u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 8059.00  1.6% GetSnapshotData       /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 6975.00  1.4% ExecInitExpr          /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 6517.00  1.3% fmgr_info_cxt_security/u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 5232.00  1.0% PostgresMain          /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 4328.00  0.9% LWLockRelease         /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 4044.00  0.8% PinBuffer /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 4037.00  0.8% _int_malloc           /lib64/libc-2.12.so             
 4026.00  0.8% StrategyGetBuffer     /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 3777.00  0.8% slot_deform_tuple     /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 3755.00  0.7% FunctionCall2Coll     /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 3741.00  0.7% __GI_vfprintf         /lib64/libc-2.12.so             
 3403.00  0.7% __strncpy_ssse3       /lib64/libc-2.12.so             
 3305.00  0.7% aliflash_reconfig_task[aliflash]          
 3090.00  0.6% _bt_checkkeys         /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 3012.00  0.6% __memset_sse2         /lib64/libc-2.12.so             
 2881.00  0.6% palloc    /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 2698.00  0.5% __strlen_sse42        /lib64/libc-2.12.so             
 2585.00  0.5% _int_free /lib64/libc-2.12.so             
 2505.00  0.5% heap_page_prune       /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 2495.00  0.5% hash_any  /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 2442.00  0.5% heap_page_prune_opt   /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 2437.00  0.5% __schedule[kernel.kallsyms]   
 2210.00  0.4% MemoryContextAllocZeroAligned     /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 2111.00  0.4% pfree     /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 2048.00  0.4% heap_update           /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 2012.00  0.4% update_blocked_averages           [kernel.kallsyms]   
 1937.00  0.4% __switch_to           [kernel.kallsyms]   
 1925.00  0.4% heap_getsysattr       /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 1916.00  0.4% TupleDescInitEntry    /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 1905.00  0.4% irq_entries_start     [kernel.kallsyms]   
 1863.00  0.4% AllocSetFree          /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 1821.00  0.4% _wordcopy_bwd_aligned /lib64/libc-2.12.so             
 1761.00  0.4% _raw_spin_lock        [kernel.kallsyms]   
 1758.00  0.4% check_stack_depth     /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 1749.00  0.3% _bt_binsrch           /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 1748.00  0.3% ReadBuffer_common     /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 1747.00  0.3% expression_tree_walker/u02/digoal/soft_bak/pgsql9.5/bin/postgres  
 1651.00  0.3% __GI___libc_malloc    /lib64/libc-2.12.so             
 1608.00  0.3% __memcmp_sse4_1       /lib64/libc-2.12.so             
 1586.00  0.3% LockAcquireExtended   /u02/digoal/soft_bak/pgsql9.5/bin/postgres  
------------------------------------------------------------------------------------------------------------  

Phase 6 bottleneck analysis

Other optimizations not attempted in this article

Readers who are interested can try:
1. Build with Intel's icc and see whether performance improves further.
2. Disable automatic analyze on the tables, and disable autovacuum and auto-analyze on the history (log) tables.
3. The PostgreSQL JDBC driver has tunables this article has not touched, e.g. avoiding type conversions and the query plan cache size (a URL example follows this list).
http://www.postgresql.org/docs/9.2/interactive/libpq-connect.html
4. There is also room for optimization in the PostgreSQL code itself, e.g. the partitioned-table code and snapshot handling.
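For item 3, the relevant pgjdbc connection-URL parameters look like this (parameter names valid for the 9.4.12xx drivers of that era; the values are illustrative, not tuned):

conn=jdbc:postgresql://localhost:1921/test01?prepareThreshold=1&preparedStatementCacheQueries=512&preparedStatementCacheSizeMiB=16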

Summary

Kernel parameter tuning summary, with the rationale behind each setting.

vm.swappiness = 0     # avoid using swap
kernel.shmmax=135497418752     # maximum shared memory segment size
net.core.rmem_max = 4194304   # The maximum receive socket buffer size in bytes
net.core.wmem_max = 4194304    # The maximum send socket buffer size in bytes.
net.core.rmem_default = 262144   # The default setting of the socket receive buffer in bytes.
net.core.wmem_default = 262144   # The default setting (in bytes) of the socket send buffer.
net.ipv4.ip_local_port_range = 9000 65535    # range of locally auto-assigned TCP/UDP ports
kernel.sem = 50100 64128000 50100 1280     # semaphore settings
vm.dirty_background_bytes = 102400000      # once system-wide dirty pages reach this amount, the background flusher (pdflush or its successor) writes out pages dirtied more than (dirty_expire_centisecs/100) seconds ago
vm.dirty_expire_centisecs = 6000    # dirty pages older than this are flushed; 6000 means 60 seconds
vm.dirty_writeback_centisecs = 50  # wakeup interval of the background flusher; 50 means 0.5 seconds
vm.dirty_ratio = 80        # if background flushing cannot keep up and dirty pages exceed 80% of memory, user processes that write to disk (fsync, fdatasync, ...) must flush dirty pages themselves
vm.nr_hugepages = 102352    # number of huge pages; multiply by Hugepagesize from /proc/meminfo to get the memory size
vm.overcommit_memory = 2     # do not allow over-allocation (overcommit) at malloc time
vm.overcommit_ratio = 90     # with overcommit_memory = 2, used to compute the amount of memory that may be committed

Memory allocation (overcommit) policy explained.
Reference:
http://blog.163.com/digoal@126/blog/static/163877040201563044143325/

With vm.overcommit_memory=0, ordinary users may not overcommit, but root may overcommit slightly.
With vm.overcommit_memory=1, overcommit is always allowed; this is risky.
With vm.overcommit_memory=2, Committed_AS may not exceed CommitLimit.
How CommitLimit is calculated:
              The CommitLimit is calculated with the following formula:
              CommitLimit = ([total RAM pages] - [total huge TLB pages]) *
              overcommit_ratio / 100 + [total swap pages]
              For example, on a system with 1G of physical RAM and 7G
              of swap with a `vm.overcommit_ratio` of 30 it would
              yield a CommitLimit of 7.3G.
[root@digoal postgresql-9.4.4]# free
             total       used       free     shared    buffers     cached
Mem:       1914436     713976    1200460      72588      32384     529364
-/+ buffers/cache:     152228    1762208
Swap:      1048572     542080     506492
[root@digoal ~]# cat /proc/meminfo |grep Commit
CommitLimit:     2005788 kB
Committed_AS:     132384 kB
The CommitLimit of roughly 2 GB in this example is what the formula above produces.

The rationale behind the overcommit limit is that memory is not actually consumed right after malloc, so if several processes request large allocations at the same time, forbidding overcommit may cause some allocations to fail even though memory is in fact still available. The Linux kernel therefore offers several policies: 2 is the more reliable, conservative choice, while 1 is risky because it can lead to OOM.

So when the database cannot start, either reduce the memory it asks for (e.g. lower shared_buffers or max_connections) or change the overcommit policy. (A worked CommitLimit calculation for this host follows.)
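Plugging in the values used on the benchmark host (RAM and swap from the top output above, 102352 huge pages of 2 MB, overcommit_ratio = 90):

CommitLimit = (264643396 kB - 102352 * 2048 kB) * 90 / 100 + 18825200 kB
            = (264643396 - 209616896) * 0.9 + 18825200
            = 49523850 + 18825200
            = 68349050 kB, roughly 65 GB left for ordinary (non-huge-page) allocations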

Kernel boot parameter tuning summary:
disable NUMA,
use the deadline I/O scheduler.

kernel /vmlinuz-3.18.24 numa=off elevator=deadline intel_idle.max_cstate=0 scsi_mod.scan=sync  

Block device tuning summary: read-ahead

blockdev --setra 16384 /dev/dfa  
blockdev --setra 16384 /dev/dfb  
blockdev --setra 16384 /dev/dfc  
blockdev --setra 16384 /dev/dm-0  

Database parameter tuning summary

max_connections = 300       # (change requires restart)  
unix_socket_directories = '.'   # comma-separated list of directories  
shared_buffers = 194GB       # let the database manage the memory as much as possible, avoiding double caching and using it more efficiently
huge_pages = on           # on, off, or try; use huge pages
work_mem = 256MB # min 64kB; reduces the chance of external (on-disk) sorts
maintenance_work_mem = 2GB  # min 1MB; speeds up index creation
autovacuum_work_mem = 2GB   # min 1MB, or -1 to use maintenance_work_mem; speeds up garbage collection
dynamic_shared_memory_type = mmap      # the default is the first option
vacuum_cost_delay = 0      # 0-100 milliseconds; no vacuum throttling, which under extreme load reduces the chance of bloat
bgwriter_delay = 10ms       # 10-10000ms between rounds; schedule the bgwriter frequently so user processes rarely have to flush dirty buffers themselves (which would raise response times)
bgwriter_lru_maxpages = 1000   # 0-1000 max buffers written/round; maximum dirty buffers flushed per round
bgwriter_lru_multiplier = 10.0          # 0-10.0 multiplier on buffers scanned/round; how many buffers to scan per round, as a multiple of the previous round's flush count
effective_io_concurrency = 2           # 1-1000; 0 disables prefetching; number of blocks prefetched for a bitmap heap scan node
wal_level = minimal         # minimal, archive, hot_standby, or logical; in production, enabling archiving is recommended
synchronous_commit = off    # synchronization level; asynchronous commit
wal_sync_method = open_sync    # the default is the first option; there is no standby, so choose an fsync method for xlog that supports O_DIRECT
full_page_writes = off      # recover from partial page writes; in production this can be off when incremental backups and archiving are in place, improving performance
wal_buffers = 1GB           # min 32kB, -1 sets based on shared_buffers; WAL buffer size, increase it if there is heavy waiting on WAL buffer writes
wal_writer_delay = 10ms         # 1-10000 milliseconds; WAL writer wakeup interval, similar to bgwriter_delay
commit_delay = 20           # range 0-100000, in microseconds; how long a group commit waits
commit_siblings = 9        # range 1-1000; group commit triggers when at least this many transactions are in the commit phase at the same time
checkpoint_timeout = 55min  # range 30s-1h; time-driven checkpoint interval
max_wal_size = 320GB    # upper bound on the WAL produced between two checkpoints
checkpoint_completion_target = 0.99     # checkpoint target duration, 0.0 - 1.0; checkpoint smoothing: if the previous interval produced 100 WAL segments, the next checkpoint is spread over the time it takes to produce 100 * checkpoint_completion_target segments
random_page_cost = 1.0     # same scale as above; random-read cost factor, the SSDs used here handle random I/O well
effective_cache_size = 240GB  # OS cache available to the planner
log_destination = 'csvlog'  # Valid values are combinations of  
logging_collector = on          # Enable capturing of stderr and csvlog  
log_truncate_on_rotation = on           # If on, an existing log file with the  
update_process_title = off  
track_activities = off  
autovacuum = on    # Enable autovacuum subprocess?  'on'  
autovacuum_max_workers = 4 # max number of autovacuum subprocesses; how many vacuum workers may run concurrently
autovacuum_naptime = 6s  # time between autovacuum runs; wakeup interval of the autovacuum launcher
autovacuum_vacuum_cost_delay = 0    # default vacuum cost delay; no vacuum throttling

Other tuning, summarized:
1. Minimize wasted I/O: block sizes are aligned with the database block size from the block device through the logical volume to the file system.
2. Alignment also reduces overlapping (read-modify-write) I/O.
3. Huge pages reduce memory-management overhead.
4. Multiple clients are used to fully utilize the database server's hardware.
5. Client-side log output is reduced to lower the interference from the benchmark client.
6. A newer compiler produces a better optimized executable.


Turning an iPad into a whiteboard: a handy tool for online video lectures

Install any AirServer-style receiver on your computer, for example:
https://annotate.net/

Install a drawing app on the iPad.
Then use AirPlay to mirror the iPad screen to the AirServer you installed.
The iPad instantly becomes a whiteboard.

No more preparing piles of slides for recorded public lectures: you can draw live, and it works far better than the Windows Paint tool.


CentOS 7.2 gpinitsystem error -Failed to complete obtain psql count Master gp_segment_configuration Script Exiting!


This happened on the following source version:

commit 7065077b4fef5acf41e67b5240771db021d52156
Author: Shang Shujie <sshang@pivotal.io>
Date:   Wed Dec 16 14:53:34 2015 +0800

    fix bug: parquet can't write null out, with test case

When I used gpinitsystem to initialize a Greenplum database, it failed with the following error.

[postgres@digoal ~]$ grep "^[a-Z]" gpinitsystem_config 
ARRAY_NAME="EMC Greenplum DW"
SEG_PREFIX=gpseg
PORT_BASE=40000
declare -a DATA_DIRECTORY=(/data01/gpdata /data01/gpdata)
MASTER_HOSTNAME=digoal.com
MASTER_DIRECTORY=/data01/gpdata
MASTER_PORT=1999
TRUSTED_SHELL=ssh
CHECK_POINT_SEGMENTS=16
ENCODING=UTF-8
MACHINE_LIST_FILE=/home/postgres/host
[postgres@digoal ~]$ cat host
digoal.com
[postgres@digoal ~]$ ssh digoal.com date
Mon Dec 21 22:27:05 CST 2015

[postgres@digoal ~]$ gpinitsystem -c ./gpinitsystem_config --locale=C --max_connections=32 --shared_buffers=256MB --su_password=digoal -B 1

20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Checking new segment hosts, Completed
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Greenplum Database Creation Parameters
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:---------------------------------------
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master Configuration
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:---------------------------------------
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master instance name       = EMC Greenplum DW
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master hostname            = digoal.com
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master port                = 1999
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master instance dir        = /data01/gpdata/gpseg-1
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master LOCALE              = C
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Greenplum segment prefix   = gpseg
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master Database            = 
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master connections         = 32
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master buffers             = 256MB
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Segment connections        = 96
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Segment buffers            = 256MB
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Checkpoint segments        = 16
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Encoding                   = UTF-8
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Postgres param file        = Off
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Initdb to be used          = /opt/gpdb/bin/initdb
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-GP_LIBRARY_PATH is         = /opt/gpdb/lib
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Ulimit check               = Passed
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Array host connect type    = Single hostname per node
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master IP address [1]      = ::1
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master IP address [2]      = 192.168.0.157
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Master IP address [3]      = fe80::e2ca:94ff:fed5:c894
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Standby Master             = Not Configured
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Primary segment #          = 2
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Total Database segments    = 2
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Trusted shell              = ssh
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Number segment hosts       = 1
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Mirroring config           = OFF
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:----------------------------------------
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-Greenplum Primary Segment Configuration
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:----------------------------------------
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-digoal.com        /data01/gpdata/gpseg0   40000   2       0
20151221:22:22:01:015135 gpinitsystem:digoal:postgres-[INFO]:-digoal.com        /data01/gpdata/gpseg1   40001   3       1
Continue with Greenplum creation Yy/Nn>
y
20151221:22:22:02:015135 gpinitsystem:digoal:postgres-[INFO]:-Building the Master instance database, please wait...
20151221:22:22:17:015135 gpinitsystem:digoal:postgres-[INFO]:-Starting the Master in admin mode
20151221:22:22:31:gpinitsystem:digoal:postgres-[FATAL]:-Failed to complete obtain psql count Master gp_segment_configuration  Script Exiting!
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[WARN]:-Script has left Greenplum Database in an incomplete state
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[WARN]:-Run command /bin/bash /home/postgres/gpAdminLogs/backout_gpinitsystem_postgres_20151221_222157 to remove these changes
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function BACKOUT_COMMAND
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function BACKOUT_COMMAND

LOG file:

20151221:22:22:18:015135 gpinitsystem:digoal:postgres-[INFO]:-Completed starting the Master in admin mode
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function PING_HOST
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-digoal.com contact established
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function PING_HOST
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function UPDATE_GPCONFIG
tail: gpinitsystem_20151221.log: file truncated
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function ERROR_CHK
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Successfully completed obtain psql count Master gp_segment_configuration
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function ERROR_CHK
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Adding -1 on digoal.com /data01/gpdata/gpseg-1 to system configuration table
INSERT 0 1
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function ERROR_CHK
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Successfully completed add -1 on digoal.com to Master gp_segment_configuration
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function ERROR_CHK
20151221:22:22:25:015135 gpinitsystem:digoal:postgres-[INFO]:-Adding -1 on digoal.com /data01/gpdata/gpseg-1 to Master gp_filespace_entry
INSERT 0 1
20151221:22:22:26:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function ERROR_CHK
20151221:22:22:26:015135 gpinitsystem:digoal:postgres-[INFO]:-Successfully completed add -1 on digoal.com /data01/gpdata/gpseg-1 to Master pg_filespace_entry
20151221:22:22:26:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function ERROR_CHK
20151221:22:22:26:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function UPDATE_GPCONFIG
20151221:22:22:26:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function LOAD_QE_SYSTEM_DATA
20151221:22:22:26:015135 gpinitsystem:digoal:postgres-[INFO]:-Adding segment digoal.com to Master system tables
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function PING_HOST
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-digoal.com contact established
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function PING_HOST
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function UPDATE_GPCONFIG
tail: gpinitsystem_20151221.log: file truncated
psql: FATAL:  semctl(17530881, 14, SETVAL, 0) failed: Invalid argument (pg_sema.c:151)
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function ERROR_CHK
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function ERROR_CHK
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function ERROR_EXIT
20151221:22:22:31:gpinitsystem:digoal:postgres-[FATAL]:-Failed to complete obtain psql count Master gp_segment_configuration  Script Exiting!
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[WARN]:-Script has left Greenplum Database in an incomplete state
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[WARN]:-Run command /bin/bash /home/postgres/gpAdminLogs/backout_gpinitsystem_postgres_20151221_222157 to remove these changes
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-Start Function BACKOUT_COMMAND
20151221:22:22:31:015135 gpinitsystem:digoal:postgres-[INFO]:-End Function BACKOUT_COMMAND

The relevant code in bin/gpinitsystem:

                U_DB=$DEFAULTDB
                CHK_COUNT=`env PGOPTIONS="-c gp_session_role=utility" $PSQL -p $MASTER_PORT -d "$U_DB" -A -t -c "SELECT count(*) FROM $GP_CONFIG_TBL WHERE content=${U_CONTENT} AND preferred_role='${U_ROLE}';" 2>$LOG_FILE` >> $LOG_FILE 2>&1
                ERROR_CHK $? "obtain psql count Master $GP_CONFIG_TBL" 2
                if [ $CHK_COUNT -eq 0 ]; then
                                LOG_MSG "[INFO]:-Adding $U_CONTENT on $U_HOSTNAME $U_DIR to system configuration table"
                                env PGOPTIONS="-c gp_session_role=utility" $PSQL -p $MASTER_PORT -d "$U_DB" -c "INSERT INTO $GP_CONFIG_TBL (dbid, content, role, preferred_role, mode, status, hostname, address, port, replication_port) VALUES (${U_DBID}, ${U_CONTENT}, '${U_ROLE}', '${U_ROLE}', 's', 'u', '${U_HOSTNAME}', '${U_ADDRESS}', ${U_PORT}, ${U_REPLICATION_PORT});" >> $LOG_FILE 2>&1
                                ERROR_CHK $? "add $U_CONTENT on $U_HOSTNAME to Master gp_segment_configuration" 2

Solution:

Later versions of systemd will clear out semaphores on user logout by default, which can interfere with gpinitsystem.

Our solution to get Greenplum to install on CentOS 7.2 is to run the following as root before running gpinitsystem:

echo "RemoveIPC=no" >> /etc/systemd/logind.conf
service systemd-logind restart
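A quick check that the change took effect and that semaphores survive a logout (not part of the official procedure):

grep RemoveIPC /etc/systemd/logind.conf   # should show RemoveIPC=no
ipcs -s                                   # semaphore sets should still be listed after logging out and back in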


PostgreSQL pg_basebackup with parallel compression

When backing up a database with pg_basebackup you can choose compressed or uncompressed output.
With the uncompressed format, if your block device is fast, this single-process copy may fail to saturate it: today's enterprise SSDs reach about 2 GB/s, while a single process copying small files may only manage around 300 MB/s, which is far too slow for a large database. Using the SSD fully would make the backup roughly 6 times faster.
So, can pg_basebackup run in parallel?
The community version does not offer parallel backup, so we have to look for other ways.
First, how pg_basebackup works:
it creates a checkpoint, enables full_page_writes, writes a backup label (checkpoint location, time, etc.), connects to the database over the streaming-replication protocol, the WAL sender sends the physical database files to pg_basebackup, and pg_basebackup writes the received files to the target (compressed or not).
Several places can become bottlenecks:
1. The WAL sender side: single process, possibly slow (in practice it is fine).
    pg_basebackup -F t -D - >/dev/null
    reaches 1.2 GB/s in this test
    (block device read-ahead: blockdev --setra 16384 /dev/...)
    device utilization is only 66%, so there is headroom; parallelism would be needed to use it.
2. The network: a single copy stream may become the bottleneck.
    a 10 Gb NIC tops out at about 1.25 GB/s with large packets
3. The pg_basebackup side: writing the data out can become the bottleneck.
    pg_basebackup -F t -D - > /dege.zzz/backup/tar
    reaches 300 MB/s in this test
    This is the bottleneck here; this article uses parallel compression to relieve the write side (or you can move to a block device with better write performance).

Besides pg_basebackup, PostgreSQL can also be backed up like this:
1. pg_start_backup('test');
2. copy files;
3. pg_stop_backup();
With this method, where can we gain speed?
Files are normally copied with cp, or the whole directory is tarred (watch out for tablespace symlinks).
Can cp or tar be parallelized? Of course (a sketch combining the three steps with pigz follows).
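Putting the three steps together with pigz, a rough sketch (paths follow the examples below; a real backup must also retain the WAL generated while the copy runs, and mind tablespace symlinks):

psql -c "select pg_start_backup('pigz_backup');"
tar -cf /digoal/backup/base.tar.gz --use-compress-prog=pigz -C /digoal/pgdata .
psql -c "select pg_stop_backup();"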
A few references:

The test uses a 1 TB database.
Clear the OS page cache first:
sync; echo 3 > /proc/sys/vm/drop_caches

pg_basebackup -F p (plain, uncompressed): 269 MB/s
pg_basebackup -F c -z (tar format, compressed): below 269 MB/s
pg_basebackup -F c (tar format, uncompressed): 269 MB/s
cp, uncompressed: 399 MB/s
Parallel compression with pigz:
date +%F%T; tar -cf /digoal/backup/test.tar.gz --use-compress-prog=pigz /digoal/pgdata; date +%F%T;
reaches 509 MB/s, but is very CPU hungry.

The next step is to replace the gzip that pg_basebackup would use with pigz, simply by piping:
pg_basebackup -F t -D - | /digoal/pigz-2.3.3/pigz -6 -p 32 > /digoal/backup/test1.tar.gz
428 MB/s, and also very CPU hungry.

[Appendix]
Streaming backup, uncompressed:
pg_basebackup -D /digoal/backup -F p
Backup size        819 GB
Backup speed       269 MB/s
Disk utilization   51%
CPU usage          1 core

Streaming backup, parallel compression:
pg_basebackup -F t -D - | /digoal/pigz-2.3.3/pigz -6 -p 24 > /digoal/backup/test1.tar.gz
Backup size        46 GB
Backup speed       428 MB/s
Disk utilization   28.6%
CPU usage          24 cores (configurable)

Copy backup, uncompressed:
(backup size 819 GB, 399 MB/s)
cp -r $PGDATA/ /digoal/backup/
Backup size        819 GB
Backup speed       399 MB/s
Disk utilization   44.8%
CPU usage          1 core

Copy backup, parallel compression:
tar -cf /digoal/backup/test.tar.gz --use-compress-prog=pigz /digoal/pgdata
Backup size        46 GB
Backup speed       509 MB/s
Disk utilization   33.8%
CPU usage          24 cores (configurable)


Greenplum / PostgreSQL --enable-profiling: generating gprof profiling code

The configure script of Greenplum and PostgreSQL has an --enable-profiling option. It turns on GCC's -pg flag and produces binaries whose processes can be analyzed with gprof.
As follows:
less configure
# enable profiling if --enable-profiling
if test "$enable_profiling" = yes && test "$ac_cv_prog_cc_g" = yes; then
  if test "$GCC" = yes; then

$as_echo "#define PROFILE_PID_DIR 1" >>confdefs.h

    CFLAGS="$CFLAGS -pg $PLATFORM_PROFILE_FLAGS"
  else
    as_fn_error $? "--enable-profiling is supported only when using GCC" "$LINENO" 5
  fi
fi

less src/template/linux
# If --enable-profiling is specified, we need -DLINUX_PROFILE
PLATFORM_PROFILE_FLAGS="-DLINUX_PROFILE"
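A minimal build sketch with profiling enabled (the prefix matches the gprof example below; Greenplum's own build scripts wrap the same configure step):

./configure --prefix=/home/digoal/gpdb --enable-profiling
make -j 16
make install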

man gcc
       -pg Generate extra code to write profile information suitable for the analysis program gprof.  You must use this option when compiling the source files you want data about, and you must also use it when linking.
The profiling output is written under $PGDATA/gprof; each process gets its own subdirectory containing a gmon.out file.
To inspect it:
#gprof -b /home/digoal/gpdb/bin/postgres gmon.out
Flat profile:
Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 80.00      0.04     0.04       91     0.44     0.44  FileRepVerify_ComputeFileHash
 20.00      0.05     0.01        1    10.00    10.00  ChangeTracking_WriteBuffer
  0.00      0.05     0.00   750562     0.00     0.00  ChangeTracking_GetRelationChangeInfoFromXlog
  0.00      0.05     0.00   670756     0.00     0.00  cdbpullup_colIdx
  0.00      0.05     0.00   253200     0.00     0.00  cdbpullup_isExprCoveredByTargetlist
  0.00      0.05     0.00   239413     0.00     0.00  cdbpullup_targetlist
......
The output looks a lot like oprofile's.
See man gprof for details on usage.


PostgreSQL: EXECUTE of a prepared statement fails with "cached plan must not change result type" after altering the table

If a prepared statement's result set is bound with *, altering the table afterwards makes EXECUTE fail with cached plan must not change result type.
Yet another good reason not to use * casually.
Test case:
postgres=# create table t(p1 int, p2 int);
CREATE TABLE
postgres=# prepare a as insert into t values ($1,$2);
PREPARE
postgres=# execute a(1,1);
INSERT 0 1
postgres=# alter table t add column p3 int default 1;
ALTER TABLE
postgres=# execute a(1,1);
INSERT 0 1
postgres=# execute a(1,1);
INSERT 0 1
postgres=# prepare a1 as insert into t values ($1,$2) returning *;
PREPARE
postgres=# execute a(1,1);
INSERT 0 1
postgres=# execute a1(1,1);
 p1 | p2 | p3 
----+----+----
  1 |  1 |  1
(1 row)

INSERT 0 1
postgres=# alter table t add column p4 int default 1;
ALTER TABLE
postgres=# execute a1(1,1);
ERROR:  cached plan must not change result type

postgres=# prepare a2 as insert into t values ($1,$2) returning p1,p2;
PREPARE
postgres=# execute a2(1,1);
 p1 | p2 
----+----
  1 |  1
(1 row)

INSERT 0 1
postgres=# alter table t add column p5 int default 1;
ALTER TABLE
postgres=# execute a2(1,1);
 p1 | p2 
----+----
  1 |  1
(1 row)

INSERT 0 1

Changing a column's type triggers the same error if the prepared statement returns the altered column:
postgres=# prepare a2 as insert into t values ($1,$2) returning p1,p2;
PREPARE
postgres=# execute a2(1,2);
 p1 | p2 
----+----
  1 |  2
(1 row)

INSERT 0 1
postgres=# alter table t alter column p1 type int8;
ALTER TABLE
postgres=# execute a2(1,2);
ERROR:  cached plan must not change result type
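
The cached plan is not rebuilt automatically for a fixed-result prepared statement; the usual remedy is to deallocate and re-prepare it (a sketch continuing the session above, output illustrative):
postgres=# deallocate a2;
DEALLOCATE
postgres=# prepare a2 as insert into t values ($1,$2) returning p1,p2;
PREPARE
postgres=# execute a2(1,2);
 p1 | p2 
----+----
  1 |  2
(1 row)

INSERT 0 1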

Relevant code:
src/backend/utils/cache/plancache.c
/*
 * RevalidateCachedQuery: ensure validity of analyzed-and-rewritten query tree.
 *
 * What we do here is re-acquire locks and redo parse analysis if necessary.
 * On return, the query_list is valid and we have sufficient locks to begin
 * planning.
 *
 * If any parse analysis activity is required, the caller's memory context is
 * used for that work.
 *
 * The result value is the transient analyzed-and-rewritten query tree if we
 * had to do re-analysis, and NIL otherwise.  (This is returned just to save
 * a tree copying step in a subsequent BuildCachedPlan call.)
 */
static List *
RevalidateCachedQuery(CachedPlanSource *plansource)
{
......
        /*
         * Check or update the result tupdesc.  XXX should we use a weaker
         * condition than equalTupleDescs() here?
         *
         * We assume the parameter types didn't change from the first time, so no
         * need to update that.
         */
        resultDesc = PlanCacheComputeResultDesc(tlist);
        if (resultDesc == NULL && plansource->resultDesc == NULL)
        {
                /* OK, doesn't return tuples */
        }
        else if (resultDesc == NULL || plansource->resultDesc == NULL ||
                         !equalTupleDescs(resultDesc, plansource->resultDesc))
        {
                /* can we give a better error message? */
                if (plansource->fixed_result)
                        ereport(ERROR,
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                         errmsg("cached plan must not change result type")));
                oldcxt = MemoryContextSwitchTo(plansource->context);
                if (resultDesc)
                        resultDesc = CreateTupleDescCopy(resultDesc);
                if (plansource->resultDesc)
                        FreeTupleDesc(plansource->resultDesc);
                plansource->resultDesc = resultDesc;
                MemoryContextSwitchTo(oldcxt);
        }


[References]
1. src/test/regress/expected/plancache.out
2. src/backend/utils/cache/plancache.c

sshpass

$
0
0
An ssh client wrapper that lets you supply the password on the command line.
http://sourceforge.net/projects/sshpass/
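
A typical (illustrative) invocation; the password, host and remote command are all placeholders:
sshpass -p 'MyPassword' ssh -o StrictHostKeyChecking=no digoal@192.168.0.1 'hostname'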

Viewing the configure options of Greenplum and PostgreSQL distributions

pg_config shows PostgreSQL's library directories, compiler, compile-time flags, and so on:

BINDIR = /home/digoal/pgsql9.5/bin
DOCDIR = /home/digoal/pgsql9.5/share/doc
HTMLDIR = /home/digoal/pgsql9.5/share/doc
INCLUDEDIR = /home/digoal/pgsql9.5/include
PKGINCLUDEDIR = /home/digoal/pgsql9.5/include
INCLUDEDIR-SERVER = /home/digoal/pgsql9.5/include/server
LIBDIR = /home/digoal/pgsql9.5/lib
PKGLIBDIR = /home/digoal/pgsql9.5/lib
LOCALEDIR = /home/digoal/pgsql9.5/share/locale
MANDIR = /home/digoal/pgsql9.5/share/man
SHAREDIR = /home/digoal/pgsql9.5/share
SYSCONFDIR = /home/digoal/pgsql9.5/etc
PGXS = /home/digoal/pgsql9.5/lib/pgxs/src/makefiles/pgxs.mk
CONFIGURE = '--prefix=/home/digoal/pgsql9.5'
CC = gcc
CPPFLAGS = -D_GNU_SOURCE
CFLAGS = -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -O2
CFLAGS_SL = -fpic
LDFLAGS = -L../../../src/common -Wl,--as-needed -Wl,-rpath,'/home/digoal/pgsql9.5/lib',--enable-new-dtags
LDFLAGS_EX = 
LDFLAGS_SL = 
LIBS = -lpgcommon -lpgport -lz -lreadline -lrt -lcrypt -ldl -lm 
VERSION = PostgreSQL 9.5.0
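
Note that the CONFIGURE line above already records the configure arguments; pg_config can also print just that line:
$ pg_config --configure
'--prefix=/home/digoal/pgsql9.5'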

If pg_config is not available, the configure arguments can also be recovered from two other places.
1. From config.log in the source tree:
less $PGSRC/config.log

This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by PostgreSQL configure 9.5.0, which was
generated by GNU Autoconf 2.69.  Invocation command line was

  $ ./configure --prefix=/home/digoal/pgsql9.5


2. If the source tree is gone, they are also saved in the installed software directory:
  $PGHOME/lib/pgxs/src/Makefile.global

# Saved arguments from configure
configure_args =  '--prefix=/home/digoal/pgsql9.5'


Configure options of the Greenplum distribution:

$GPHOME/lib/postgresql/pgxs/src/Makefile.global
# Saved arguments from configure
configure_args =  '--enable-snmp' '--enable-connectemc' '--enable-ddboost' '--with-gssapi' '--enable-netbackup' '--enable-orca' '--with-libxml' '--with-pgport=5432' '--with-libedit-preferred' '--with-perl' '--with-python' '--with-java' '--with-includes=/data/pulse2-agent/agents/agent1/work/GPDB-4_3_x-rcbuilds/rhel5_x86_64/src/ext/rhel5_x86_64/include /data/pulse2-agent/agents/agent1/work/GPDB-4_3_x-rcbuilds/rhel5_x86_64/src/ext/rhel5_x86_64/include/libxml2' '--with-libraries=/data/pulse2-agent/agents/agent1/work/GPDB-4_3_x-rcbuilds/rhel5_x86_64/src/ext/rhel5_x86_64/lib' '--with-openssl' '--with-pam' '--with-krb5' '--with-ldap' 'CURL_CONFIG=/data/pulse2-agent/agents/agent1/work/GPDB-4_3_x-rcbuilds/rhel5_x86_64/src/ext/rhel5_x86_64/bin/curl-config' '--prefix=/data/pulse2-agent/agents/agent1/work/GPDB-4_3_x-rcbuilds/rhel5_x86_64/4.3.6.1-build-2_output/greenplum-db-4.3.6.1-build-2' '--with-docdir=/data/pulse2-agent/agents/agent1/work/GPDB-4_3_x-rcbuilds/rhel5_x86_64/4.3.6.1-build-2_output/greenplum-db-4.3.6.1-build-2/doc' '--mandir=/data/pulse2-agent/agents/agent1/work/GPDB-4_3_x-rcbuilds/rhel5_x86_64/4.3.6.1-build-2_output/greenplum-db-4.3.6.1-build-2/man' 'CC=gcc -m64' 'CFLAGS=-O3 -funroll-loops -fargument-noalias-global -fno-omit-frame-pointer -g -finline-limit=1800'

With this you can take an EDB or Greenplum distribution and see exactly which configure options it was built with.


PostgreSQL: enforcing uniqueness on a non-primary-key, non-unique column with advisory locks

Without a unique or primary key constraint, the database does not enforce uniqueness on a column. What can be done?
Method 1: serialize the operation - query first, insert only if no row is found.
This is very inefficient.
Test:
postgres=# create table tbl(c1 text);
CREATE TABLE
postgres=# create index idx_c1 on tbl(c1);
CREATE INDEX
postgres=# create or replace function load(v_c1 text) returns void as $$ 
declare
begin
  perform 1 from tbl where c1=v_c1 limit 1;
  if found then
    return;
  else
    insert into tbl(c1) values (v_c1);
  end if;
end;
$$ language plpgsql strict;
CREATE FUNCTION
Benchmark:
vi test.sql
\setrandom c1 1 50000
select load(:c1);

pgbench -M prepared -n -r -P 1 -f test.sql -c 1 -j 1 -T 100
Analysis: this approach only guarantees uniqueness of c1 when calls are serialized; with concurrent sessions it cannot. Throughput therefore depends entirely on the RT of load(): effective inserts are slow, while no-op calls (the value already exists) only pay for the lookup and are comparatively fast.
At around 60 seconds all 50,000 values are present, so every call becomes a no-op that returns right after the lookup.
progress: 59.0 s, 375.0 tps, lat 2.673 ms stddev 1.113
progress: 60.0 s, 368.0 tps, lat 2.713 ms stddev 1.110
progress: 61.0 s, 5787.1 tps, lat 0.172 ms stddev 0.521
progress: 62.0 s, 12538.1 tps, lat 0.079 ms stddev 0.012
progress: 63.0 s, 12802.2 tps, lat 0.077 ms stddev 0.011
Verify that the values are unique:
postgres=# select count(*),count(distinct c1) from tbl;
  count  |  count  
---------+---------
 50000 | 50000 
(1 row)
Performance trend: [figure omitted]


Method 2:
The same trick as in the flash-sale scenario (where PostgreSQL reached about 230,000 qps on a single hot record): use an advisory lock. When the lock conflicts, the conflicting sessions are serialized, while all non-conflicting sessions still run in parallel.
How much does this improve performance?
postgres=# create or replace function load(v_c1 text) returns void as $$
declare
begin 
  perform 1 from tbl where c1=v_c1 limit 1; 
  if found then 
    return; 
  end if;
  if ( pg_try_advisory_xact_lock(hashtext(v_c1)) ) then
    perform 1 from tbl where c1=v_c1 limit 1;
    if not found then
      insert into tbl(c1) values (v_c1);
    else
      return;
    end if;
  else
    return;
  end if;
end;
$$ language plpgsql strict;

Benchmark with 5,000,000 distinct values.
vi test.sql
\setrandom c1 1 5000000
select load(:c1);

52 concurrent clients:
pgbench -M prepared -n -r -P 1 -f test.sql -c 52 -j 52 -T 100
It gets faster and faster because more and more calls become no-ops. Once every call is a no-op, qps should theoretically exceed 200,000 here as well.
progress: 96.0 s, 161872.6 tps, lat 0.319 ms stddev 0.429
progress: 97.0 s, 161766.4 tps, lat 0.319 ms stddev 0.387
progress: 98.0 s, 164232.7 tps, lat 0.315 ms stddev 0.419
progress: 99.0 s, 165476.5 tps, lat 0.312 ms stddev 0.405
progress: 100.0 s, 166866.0 tps, lat 0.309 ms stddev 0.410

transaction type: Custom query
scaling factor: 1
query mode: prepared
number of clients: 52
number of threads: 52
duration: 100 s
number of transactions actually processed: 12510348
latency average: 0.414 ms
latency stddev: 0.450 ms
tps = 125034.429736 (including connections establishing)
tps = 125043.765999 (excluding connections establishing)
statement latencies in milliseconds:
        0.003204        \setrandom c1 1 5000000
        0.410254        select load(:c1);

Verify: uniqueness holds under concurrency - bingo.
postgres=# select count(*),count(distinct c1) from tbl;
  count  |  count  
---------+---------
 4593181 | 4593181
(1 row)
Performance trend: [figure omitted]

Example use case:
tbl(internal_id serial8 primary key, nick_name text unique, ......)
internal_id is the internal unique ID.
nick_name is the user-facing ID, also unique.
The user supplies a nick_name and the internal ID is generated from a sequence.
If these constraints exist from the start, there is no problem.

But suppose the table was created without the unique constraint and duplicate values already exist.
Now you want the constraint: first deduplicate, then create the unique constraint.
If users keep inserting data and the business must not be interrupted while you deduplicate and add the constraint, what can you do?
Approach:
Ignore the historical rows for now and make new values unique with the method above;
then deduplicate;
then build the unique index concurrently (a sketch of these last two steps follows at the end of this section).
Example code (step 1, keeping new values unique while returning internal_id):
postgres=# drop table tbl;
DROP TABLE
postgres=# create table tbl(internal_id serial8, nick_name text);
CREATE TABLE
postgres=# create index idx_tbl_nickname on tbl(nick_name);
CREATE INDEX
postgres=# drop function load(text);
DROP FUNCTION
postgres=# create or replace function load(v_c1 text) returns int8 as $$
declare
  i int8;
begin 
  select internal_id  into i from tbl where nick_name = v_c1 limit 1; 
  if found then 
    return i; 
  end if;
  LOOP
  if ( pg_try_advisory_xact_lock(hashtext(v_c1)) ) then
    select internal_id  into i from tbl where nick_name = v_c1 limit 1; 
    if not found then
      insert into tbl(nick_name ) values (v_c1) returning internal_id  into i ;
      return i;
    else
      return i;
    end if;
  end if;
  end loop;
end;
$$ language plpgsql strict;

Benchmark with 5,000,000 distinct values.
vi test.sql
\setrandom c1 1 5000000
select load(:c1);

52 concurrent clients:
pgbench -M prepared -n -r -P 1 -f test.sql -c 52 -j 52 -T 100

Verify uniqueness:
postgres=# select count(*),count(distinct nick_name) from tbl;
  count  |  count  
---------+---------
 3966568 | 3966568
(1 row)

Performance trend: [figure omitted]
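
A minimal sketch of steps 2 and 3 above (deduplicate, then add the unique index without blocking writes). It assumes duplicates keep the row with the smallest internal_id; CREATE INDEX CONCURRENTLY may need a retry if new duplicates slip in before it finishes:
-- step 2: remove historical duplicates, keeping the smallest internal_id per nick_name
delete from tbl a
 using tbl b
 where a.nick_name = b.nick_name
   and a.internal_id > b.internal_id;
-- step 3: build the unique index without taking a long blocking lock
create unique index concurrently uk_tbl_nick_name on tbl (nick_name);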


PostgreSQL subtransaction id & command id overflow analysis

PostgreSQL assigns a subtransaction id to every savepoint and to every exception section of a function; the counter only increases.
Even if the exception is never raised, a subtransaction id is still consumed.
PushTransaction@src/backend/access/transam/xact.c
        /*
         * Assign a subtransaction ID, watching out for counter wraparound.
         */
        currentSubTransactionId += 1;
        if (currentSubTransactionId == InvalidSubTransactionId)
        {
                currentSubTransactionId -= 1;
                pfree(s);
                ereport(ERROR,
                                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                 errmsg("cannot have more than 2^32-1 subtransactions in a transaction")));
        }

The command id counts the write commands (DDL, DML, etc.) issued within one transaction; it also only increases.
CommandCounterIncrement@src/backend/access/transam/xact.c
        if (currentCommandIdUsed)
        {
                currentCommandId += 1;
                if (currentCommandId == InvalidCommandId)
                {
                        currentCommandId -= 1;
                        ereport(ERROR,
                                        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                         errmsg("cannot have more than 2^32-2 commands in a transaction")));
                }
                currentCommandIdUsed = false;

Both the subtransaction id and the command id are unsigned 32-bit integers: a transaction may allocate at most 2^32-1 subtransactions and 2^32-2 commands.
typedef uint32 SubTransactionId;
typedef uint32 CommandId;

When can the subtransaction id overflow?
1. n savepoints are used within the transaction.
2. Functions with exception sections are called within the transaction; each exception section entered costs one subtransaction id, so repeated calls add up - call that total m.
If n+m exceeds 2^32-1, it overflows.

When can the command id overflow?
When a single transaction issues more than 2^32-2 DDL/DML statements.
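
Before patching the source as shown below, you can already get a rough feel for command id assignment from the hidden cmin column, which records the command id of the inserting command within the current transaction (a small illustration; t_cid is just a scratch table and the output is illustrative):
postgres=# create table t_cid(id int);
CREATE TABLE
postgres=# begin;
BEGIN
postgres=# insert into t_cid values (1);
INSERT 0 1
postgres=# insert into t_cid values (2);
INSERT 0 1
postgres=# select cmin, id from t_cid;
 cmin | id 
------+----
    0 |  1
    1 |  2
(2 rows)
postgres=# end;
COMMIT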

Tracking method - patch the source to print the counters:
                currentCommandId += 1;
// add the following
                ereport(NOTICE,
                        (errmsg("currentCommandId: %d", currentCommandId)));
                if (currentCommandId == InvalidCommandId)
                {
                        currentCommandId -= 1;
                        ereport(ERROR,
                                        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                         errmsg("cannot have more than 2^32-2 commands in a transaction")));
                }
                currentCommandIdUsed = false;

...
        /*
         * Assign a subtransaction ID, watching out for counter wraparound.
         */
        currentSubTransactionId += 1;
// add the following
        ereport(NOTICE,
                (errmsg("currentSubTransactionId: %d", currentSubTransactionId)));
        if (currentSubTransactionId == InvalidSubTransactionId)
        {
                currentSubTransactionId -= 1;
                pfree(s);
                ereport(ERROR,
                                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                 errmsg("cannot have more than 2^32-1 subtransactions in a transaction")));
        }

Recompile, reinstall and restart the database.
psql
Set the client message level to notice:
postgres=# set client_min_messages='notice';
SET
Create a test function:
postgres=# create or replace function f() returns void as $$
declare
begin
exception           
  when others then
  raise exception 'a';
end;                  
$$ language plpgsql;
Watch subtransaction ids being assigned:
postgres=# select f();
NOTICE:  currentSubTransactionId: 2
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
 f 
---
 
(1 row)
A volatile function is invoked for every tuple:
postgres=# select f() from generate_series(1,10);
NOTICE:  currentSubTransactionId: 2
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 3
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 4
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 5
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 6
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 7
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 8
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 9
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 10
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 11
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
Without an exception section, no subtransaction is created:
postgres=# select * from generate_series(1,10);
 generate_series 
-----------------
               1
               2
               3
               4
               5
               6
               7
               8
               9
              10
(10 rows)
postgres=# create or replace function f1() returns void as $$
postgres$# declare
postgres$# begin
postgres$# end;
postgres$# $$ language plpgsql;
NOTICE:  currentCommandId: 1
CREATE FUNCTION
postgres=# select f1() from generate_series(1,10);
 f1 
----
 
 
 
 
 
 
 
 
 
 
(10 rows)

Now track the command id.
DDL and DML consume command ids:
postgres=# create table t(id int);
NOTICE:  currentCommandId: 1
CREATE TABLE
postgres=# insert into t values (1);
NOTICE:  currentCommandId: 2
INSERT 0 1
postgres=# insert into t values (1);
NOTICE:  currentCommandId: 3
INSERT 0 1
Plain queries do not consume a command id:
postgres=# select 1;
 ?column? 
----------
        1
(1 row)
SAVEPOINT creates a subtransaction:
postgres=# savepoint a;
NOTICE:  currentSubTransactionId: 12
SAVEPOINT
postgres=# savepoint a;
NOTICE:  currentSubTransactionId: 13
SAVEPOINT
postgres=# savepoint a;
NOTICE:  currentSubTransactionId: 14
SAVEPOINT
postgres=# savepoint a;
NOTICE:  currentSubTransactionId: 15
SAVEPOINT
ROLLBACK TO SAVEPOINT also creates a subtransaction:
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 16
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 17
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 18
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 19
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 20
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 21
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 22
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 23
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 24
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 25
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 26
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 27
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 28
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 29
ROLLBACK
postgres=# rollback to savepoint a;
NOTICE:  currentSubTransactionId: 30
ROLLBACK
postgres=# end;
COMMIT

A function without an exception section creates no subtransaction:
postgres=# create or replace function f() returns void as $$
declare
begin
end;                          
$$ language plpgsql;
NOTICE:  currentCommandId: 1
CREATE FUNCTION
postgres=# select f();
 f 
---
 
(1 row)
Every exception section requires its own subtransaction:
create or replace function f() returns void as $$
declare
begin

begin
exception when others then
return; 
end;   

begin
exception when others then
return; 
end;   

exception when others then
return;
end;
$$ language plpgsql;
postgres=# select f();
NOTICE:  currentSubTransactionId: 2
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 3
CONTEXT:  PL/pgSQL function f() line 6 during statement block entry
NOTICE:  currentSubTransactionId: 4
CONTEXT:  PL/pgSQL function f() line 11 during statement block entry
 f 
---
 
(1 row)

An overflow example:
postgres=# select count(*) from (select f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f(),f() from generate_series(1,500000000))t;
ERROR:  cannot have more than 2^32-1 subtransactions in a transaction
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry

A side note on function volatility (covered in an earlier post):
stable and volatile functions are invoked for every tuple within a single SQL statement (strictly speaking, a stable function called with identical arguments should not need to be re-invoked - a known PostgreSQL limitation).
An immutable function is invoked only once in any case; in addition, when bound parameters are used, an immutable call is folded into a constant.
postgres=# alter function f() immutable;
ALTER FUNCTION
Invoked only once:
postgres=# select f() from generate_series(1,100);
NOTICE:  currentSubTransactionId: 2
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 3
CONTEXT:  PL/pgSQL function f() line 6 during statement block entry
NOTICE:  currentSubTransactionId: 4
CONTEXT:  PL/pgSQL function f() line 11 during statement block entry
 f 
---
Changed to stable, it is invoked repeatedly:
postgres=# alter function f() stable;
ALTER FUNCTION
postgres=# select f() from generate_series(1,100);
NOTICE:  currentSubTransactionId: 2
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 3
CONTEXT:  PL/pgSQL function f() line 6 during statement block entry
NOTICE:  currentSubTransactionId: 4
CONTEXT:  PL/pgSQL function f() line 11 during statement block entry
NOTICE:  currentSubTransactionId: 5
CONTEXT:  PL/pgSQL function f() line 3 during statement block entry
NOTICE:  currentSubTransactionId: 6
......


EnterpriseDB & PostgreSQL RLS & Oracle VPD

For PostgreSQL 9.5's native RLS usage, see the earlier post.
EnterpriseDB's RLS usage differs slightly: since EDB aims at Oracle compatibility, it follows Oracle's VPD model, and policies are installed by calling DBMS_RLS.add_policy.
For detailed usage, refer to:
Example:
postgres=> create table test(id int, info text, rol name);
Create a policy that only lets a user operate on (SELECT, UPDATE, DELETE, INSERT) rows where rol equals the current user name.
First create a policy function; its argument and return types must follow exactly this signature.
The arguments identify which schema.object the row-security policy is applied to, and the return value is the predicate that gets appended.
For example, 'rol='||current_user below is appended as an extra condition that every affected row must satisfy.
postgres=> create or replace function f(p_schema text, p_obj text) returns text as $$
declare                                         
begin                       
  return 'rol=$_$'||current_user||'$_$';
end;                              
$$ language plpgsql;
CREATE FUNCTION
Create the policy (requires a superuser):
postgres=> select                      
dbms_rls.add_policy (object_schema => 'public'::text    
,object_name => 'test'::text    
,policy_name => 'policy1'::text           
,function_schema => 'public'::text
,STATEMENT_TYPES => 'SELECT, INSERT, UPDATE, DELETE'::text
,POLICY_FUNCTION => 'f'::text, update_check=>'true'); 
 rds_add_policy 
----------------
 
(1 row)
Verify that the policy works:
postgres=> select current_user;
 current_user 
--------------
 digoal
(1 row)
The current user is digoal, so only rows with rol='digoal' can be inserted:
postgres=> insert into test values (1,'test','digoal');
INSERT 16426 1
postgres=> insert into test values (1,'test','A');
ERROR:  policy with check option violation
DETAIL:  Policy predicate was evaluated to FALSE with the updated values
Switch to a user named test and repeat:
postgres=> insert into test values (1,'test','a');
ERROR:  policy with check option violation
DETAIL:  Policy predicate was evaluated to FALSE with the updated values
postgres=> insert into test values (1,'test','test');
INSERT 16428 1
postgres=> select current_user;
 current_user 
--------------
 test
(1 row)
postgres=> update test set id=1;
UPDATE 1
postgres=> update test set id=1 returning *;
 id | info | rol  
----+------+------
  1 | test | test
(1 row)
UPDATE 1
This gives per-user data isolation.

List existing policies:
postgres=# select * from dba_policies ;
 object_owner | schema_name | object_name | policy_group |    policy_name     | pf_owner | package | function | sel | ins | upd | del | idx | chk_option | enable | static_policy | policy_type | long_predicate 
--------------+-------------+-------------+--------------+--------------------+----------+---------+----------+-----+-----+-----+-----+-----+------------+--------+---------------+-------------+----------------
 digoal       | public      | test        |              | policy1            | public   |         | f        | YES | YES | YES | YES | NO  | YES        | YES    | NO            | UNKNOWN     | YES

Disable / enable a policy:
postgres=# select dbms_rls.enable_policy('public','test','policy1',false);  -- disable
postgres=# select dbms_rls.enable_policy('public','test','policy1',true);  -- enable

Drop a policy:
postgres=# select dbms_rls.drop_policy('public','test','policy1');
-[ RECORD 1 ]-
drop_policy | 

Note that an EDB policy applies to all users; unlike native PostgreSQL RLS, it cannot be created for specific roles only.
To make an EDB policy role-specific, branch on the role inside the policy function and return a different predicate per role, for example:
  case current_user
    when 'digoal' then return '.....';
    when '...'    then return '...';
    else               return '...';
  end case;
and so on.

One more thing to watch: policies can be static or dynamic, controlled by an add_policy parameter (dynamic is the default).
A static policy is compiled the first time it fires in a session, and the cached predicate is reused from then on.
A dynamic policy re-invokes the policy function every time; if the return value depends on something like current_user, you need a dynamic policy.
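
For instance, a policy whose predicate does not depend on session state could be registered as static. A hypothetical sketch (f_static stands for such a policy function; the parameter names follow the wrapper signature shown further below):
postgres=# select dbms_rls.add_policy(object_schema => 'public'
           ,object_name => 'test'
           ,policy_name => 'policy_static'
           ,function_schema => 'public'
           ,policy_function => 'f_static'
           ,statement_types => 'SELECT'
           ,static_policy => true);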

Privileges:
The dbms_rls package can only be executed by superusers. To let ordinary users use it, either wrap it or grant execute on the function to them.
For example:
create or replace function your_add_policy(object_schema text DEFAULT NULL::text, object_name text, policy_name text, function_schema text DEFAULT NULL::text, policy_function text, statement_types text DEFAULT 'insert,update,delete,select'::text, update_check boolean DEFAULT false, enable boolean DEFAULT true, static_policy boolean DEFAULT false, policy_type integer DEFAULT NULL::integer, long_predicate boolean DEFAULT false, sec_relevant_cols text DEFAULT NULL::text, sec_relevant_cols_opt integer DEFAULT NULL::integer) returns void as $$         
declare
begin
perform dbms_rls.add_policy(object_schema , object_name , policy_name , function_schema , policy_function , statement_types , update_check , enable , static_policy , policy_type , long_predicate , sec_relevant_cols , sec_relevant_cols_opt);
end;
$$ language plpgsql;

grant execute on function your_add_policy( text ,  text,  text,  text ,  text,  text ,  boolean ,  boolean ,  boolean ,  integer ,  boolean ,  text ,  integer ) to public;

Risk:
With such a wrapper, an ordinary user can install a policy on a table created by a superuser, which effectively amounts to privilege escalation.
For example, the user creates a function like this:
create or replace function f2(name,name) returns text as $$
declare
begin
return 'false';
end;
$$ language plpgsql;
Using it as the policy function on a table owned by a superuser makes the table's contents completely invisible to the superuser, which is quite dangerous.
Fortunately it has no effect on system catalogs, otherwise the problem would be far worse.

Hardening:
In the wrapper function, filter out the tables on which policies must not be created:
create or replace function your_add_policy(object_schema text DEFAULT NULL::text, object_name text, policy_name text, function_schema text DEFAULT NULL::text, policy_function text, statement_types text DEFAULT 'insert,update,delete,select'::text, update_check boolean DEFAULT false, enable boolean DEFAULT true, static_policy boolean DEFAULT false, policy_type integer DEFAULT NULL::integer, long_predicate boolean DEFAULT false, sec_relevant_cols text DEFAULT NULL::text, sec_relevant_cols_opt integer DEFAULT NULL::integer) returns void as $$         
declare
filter_name text[];
begin
filter_name = array['public.tbl1'];  -- tables to exclude from user-created policies
perform 1 where object_schema||'.'||object_name = any(filter_name);
if not found then
perform dbms_rls.add_policy(object_schema , object_name , policy_name , function_schema , policy_function , statement_types , update_check , enable , static_policy , policy_type , long_predicate , sec_relevant_cols , sec_relevant_cols_opt);
end if;
end;
$$ language plpgsql security definer;

grant execute on function your_add_policy( text ,  text,  text,  text ,  text,  text ,  boolean ,  boolean ,  boolean ,  integer ,  boolean ,  text ,  integer ) to public;

When a user now calls your_add_policy, it silently skips public.tbl1.


PostgreSQL Oracle compatibility - implementing autonomous transactions in functions


Oracle users who rely on autonomous transactions inside functions run into a thorny problem when migrating to PostgreSQL, because a PostgreSQL function runs as a single transaction: it either commits or rolls back as a whole, the only exception being EXCEPTION sections, each of which is a subtransaction.
Parallel or nested blocks with EXCEPTION handlers can be used to approximate autonomous transactions.

Example 2 below is the recommended approach.

Example 1:

Flat blocks:
Split the blocks of the original function into small functions (you don't have to, but without splitting, the code below gets longer).

For example:

declare
  trace int;
  -- other variable declarations
begin
-- block 1: pass the variables it needs as function arguments
  select func1(...) into ...;
  trace = 1;  -- block 1 completed successfully
-- block 2: pass the variables it needs as function arguments
  select func2(...) into ...;
  trace = 2;  -- block 2 completed successfully
......
-- block n: pass the variables it needs as function arguments
  select funcn(...) into ...;
  trace = n;  -- block n completed successfully
......
-- if a later block fails, control jumps to the exception handler, which uses trace
-- to know which blocks had already completed and re-applies them.

exception when others then
  case
    when trace = 1 then
      perform func1(...);
    when trace = 2 then
      perform func1(...);
      perform func2(...);
......
    when trace = n then
      perform func1(...);
      ......
      perform funcn(...);
......
  end case;
end;

This does not cover every situation: the exception handler itself may fail, and once it does, the remaining statements in it will not run.


Example 2:

Parallel plus nested blocks.

The demo function's argument is one of block1, block2.1, block2.2, block3.1; it tells the function in which block to raise an error. The EXCEPTION handler of the corresponding block catches and handles the error, and execution then continues with the next block at the same level. If an outer block fails, inner blocks that have not run yet never get a chance to run.

Adjust the number of blocks and the nesting depth to match the business logic; this pattern covers the requirement well.

(The one thing it cannot provide is a committed subtransaction that becomes immediately visible to other transactions; for that, use the dblink approach shown further below.)

create or replace function ft(err_level text) returns void as $$
declare
begin -- block level 1
  raise notice 'block level 1';
  if (err_level='block1') then
    raise exception '%', err_level;
  end if;

  begin -- block level 2.1
    raise notice 'block level 2.1';  -- replace with real business SQL
    if (err_level='block2.1') then
      raise exception '%', err_level;
    end if;

    begin -- block level 3.1
      raise notice 'block level 3.1';
      if (err_level='block3.1') then
        raise exception '%', err_level;
      end if;
      exception when others then  -- you can catch any specific ERROR CODE or SQLSTATE here.
        raise notice 'end block level 3.1';
    end; -- end block level 3.1

    exception when others then  -- catches any error; rolls back block 2.1's business SQL
      raise notice 'end block level 2.1';
  end; -- end block level 2.1

  begin -- block level 2.2
    raise notice 'block level 2.2';
    if (err_level='block2.2') then
      raise exception '%', err_level;
    end if;
    exception when others then  -- you can catch any specific ERROR CODE or SQLSTATE here.
      raise notice 'end block level 2.2';
  end; -- end block level 2.2

  exception when others then  -- you can catch any specific ERROR CODE or SQLSTATE here.
    raise notice 'end block level 1';
end; -- end block level 1
$$ language plpgsql;

Test:

Error in block 1 - nothing after the failing statement runs:
postgres=# select ft('block1');
NOTICE:  block level 1
NOTICE:  end block level 1
 ft 
----
 
(1 row)

Error in block 2.1 - the rest of block 2.1 is skipped, but sibling blocks such as 2.2 still run:
postgres=# select ft('block2.1');
NOTICE:  block level 1
NOTICE:  block level 2.1
NOTICE:  end block level 2.1
NOTICE:  block level 2.2
 ft 
----
 
(1 row)

Error in block 2.2:
postgres=# select ft('block2.2');
NOTICE:  block level 1
NOTICE:  block level 2.1
NOTICE:  block level 3.1
NOTICE:  block level 2.2
NOTICE:  end block level 2.2
 ft 
----
 
(1 row)

Error in block 3.1:
postgres=# select ft('block3.1');
NOTICE:  block level 1
NOTICE:  block level 2.1
NOTICE:  block level 3.1
NOTICE:  end block level 3.1
NOTICE:  block level 2.2
 ft 
----
 
(1 row)


A more concrete variant of example 2:

drop table tt;
create table tt(id int primary key, info text);
insert into tt values(5,'test');

create or replace function ft() returns void as $$
declare
begin -- block level 1

  begin -- block level 2.1
    insert into tt values (1,'test'),(2,'test'),(3,'test');
    exception when others then 
      raise notice 'rollback block level 2.1';
  end; -- end block level 2.1

  begin -- block level 2.2
    insert into tt values (4,'test'),(5,'test'),(6,'test'); -- primary key conflict: this insert fails, but the following blocks still run
    exception when others then 
      raise notice 'rollback block level 2.2';
  end; -- end block level 2.2

  begin -- block level 2.3
    insert into tt values (7,'test'),(8,'test'),(9,'test');
    exception when others then 
      raise notice 'rollback block level 2.3';
  end; -- end block level 2.3

  exception when others then  -- you can catch any specific ERROR CODE or SQLSTATE here.
    raise notice 'rollback block level 1';
end; -- end block level 1
$$ language plpgsql;

postgres=# select ft();
NOTICE:  rollback block level 2.2
 ft 
----
 
(1 row)

postgres=# select * from tt;
 id | info 
----+------
  5 | test
  1 | test
  2 | test
  3 | test
  7 | test
  8 | test
  9 | test
(7 rows)


Example 3:
Use dblink. As before, the pieces that must commit independently are first wrapped as small functions.
For example:

create extension dblink;
CREATE SERVER fdtest FOREIGN DATA WRAPPER dblink_fdw OPTIONS (hostaddr '127.0.0.1', dbname '<database containing the functions>');
CREATE USER MAPPING FOR <calling user> SERVER fdtest OPTIONS (user '<calling user>', password '<password>');
GRANT USAGE ON FOREIGN SERVER fdtest TO <calling user>;

Function body:

declare
  dblink_block_res1 record;
  dblink_block_res2 record;
...
  dblink_block_resn record;
...
  -- other variable declarations
begin
-- open the connection
if ( dblink_connect('myconn', 'fdtest') <> 'OK' ) then
  raise notice 'connection failed';
  return;
end if;
-- block 1: pass the variables it needs as function arguments
  select dblink('myconn', 'select func1($1,$2,...)') into dblink_block_res1;  -- replace $1,$2,... with constants, or build dynamic SQL
-- check the intermediate result
  if not found then  -- the dblink call failed
    -- error handling
  else
    -- the subtransaction behind this dblink call is already committed
  end if;
-- block 2: pass the variables it needs as function arguments
  select dblink('myconn', 'select func2($1,$2,...)') into dblink_block_res2;  -- replace $1,$2,... with constants, or build dynamic SQL
-- check the intermediate result
  if not found then  -- the remote call failed
    -- error handling
  else
    -- the subtransaction behind this dblink call is already committed
  end if;
......
-- block n: pass the variables it needs as function arguments
  select dblink('myconn', 'select funcn($1,$2,...)') into dblink_block_resn;  -- replace $1,$2,... with constants, or build dynamic SQL
-- check the intermediate result
  if not found then  -- the remote call failed
    -- error handling
  else
    -- the subtransaction behind this dblink call is already committed
  end if;
......
exception when others then
...
end;
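
A minimal, self-contained illustration of why this gives autonomous-transaction behaviour (audit_log is a hypothetical table and the output is illustrative; each dblink_exec runs and commits in its own remote transaction, so the insert survives even though the local transaction aborts):
postgres=# create table audit_log(msg text);
CREATE TABLE
postgres=# do language plpgsql $$
begin
  perform dblink_connect('myconn2', 'fdtest');
  -- committed immediately on the remote connection, independently of this transaction
  perform dblink_exec('myconn2', 'insert into audit_log(msg) values (''step 1 done'')');
  perform dblink_disconnect('myconn2');
  raise exception 'force the local transaction to roll back';
end;
$$;
ERROR:  force the local transaction to roll back
postgres=# select * from audit_log;
     msg     
-------------
 step 1 done
(1 row)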

Other references:
http://www.postgresql.org/docs/9.5/static/contrib-dblink-function.html
http://postgresql.nabble.com/Autonomous-Transaction-WIP-td5798928.html
https://lwn.net/Articles/648973/


PostgreSQL Oracle compatibility - WMSYS.WM_CONCAT


A demo of Oracle's row-to-string aggregation function WMSYS.WM_CONCAT:
select * from itlife365_course a where name= '张三';
name 课程 score
张三 数学 99
张三 语文 89
张三 英语 93

In the scenario above, WMSYS.WM_CONCAT(a.课程) can be used to join the 课程 values of those rows with ','.

For example:

select name, to_char(WMSYS.WM_CONCAT(a.课程))
  from itlife365_course a
 where name= '张三'
 group by a.name;

Note: WMSYS.WM_CONCAT returns a CLOB, hence the to_char() wrapper.
When merging many values into one column with wmsys.wm_concat you may hit:
ORA-22813: operand value exceeds system limits
The official documentation explains this as the total length exceeding 30k;
use another approach in that case.

In PostgreSQL, the string_agg aggregate achieves the same result:
select name, string_agg(a.课程, ',')
  from itlife365_course a
 where name= '张三'
 group by a.name;
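
If the concatenation order matters, string_agg also accepts an ORDER BY inside the aggregate call (a small variation of the query above):
select name, string_agg(a.课程, ',' order by a.课程)
  from itlife365_course a
 where name= '张三'
 group by a.name;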

If you prefer not to change the application code, you can create a WM_CONCAT aggregate of your own, for example:

create schema WMSYS;
create or replace function WMSYS.sf_concat(text,text) returns text as $$
  select case when $1 is not null then $1||','||$2 else $2 end;
$$ language sql called on null input;
create AGGREGATE WMSYS.wm_concat (text) (sfunc=WMSYS.sf_concat,stype=text);

Test:

postgres=# select reltype,wmsys.wm_concat(relname) from pg_class group by reltype order by reltype;
 reltype |                                                                                                                                                                                                                                                                                                                                                                                             wm_concat                                                             

---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------
       0 | pg_oid_16388_index,pg_toast_2619_index,pg_authid_rolname_index,pg_attribute_relid_attnam_index,pg_attribute_relid_attnum_index,pg_toast_1255_index,ha_health_check_pkey,pg_toast_2606_index,pg_am_name_index,pg_am_oid_index,pg_am
op_fam_strat_index,pg_amop_opr_fam_index,pg_amop_oid_index,pg_amproc_fam_proc_index,pg_amproc_oid_index,pg_aggregate_fnoid_index,pg_toast_2618_index,pg_toast_2620_index,pg_toast_2609_index,pg_cast_oid_index,pg_cast_source_target_index,pg
_toast_2615_index,pg_toast_2964_index,pg_auth_members_role_member_index,pg_auth_members_member_role_index,pg_toast_2396_index,pg_toast_3596_index,pg_collation_oid_index,pg_collation_name_enc_nsp_index,pg_toast_2893_index,pg_database_datn
ame_index,pg_database_oid_index,pg_proc_oid_index,pg_proc_proname_args_nsp_index,pg_inherits_parent_index,pg_inherits_relid_seqno_index,pg_index_indrelid_index,pg_index_indexrelid_index,pg_operator_oid_index,pg_operator_oprname_l_r_n_ind
ex,pg_opfamily_am_name_nsp_index,pg_opfamily_oid_index,pg_opclass_am_name_nsp_index,pg_opclass_oid_index,pg_language_name_index,pg_language_oid_index,pg_largeobject_metadata_oid_index,pg_rewrite_oid_index,pg_rewrite_rel_rulename_index,pg
_trigger_tgconstraint_index,pg_trigger_tgrelid_tgname_index,pg_trigger_oid_index,pg_event_trigger_evtname_index,pg_event_trigger_oid_index,pg_description_o_c_o_index,pg_enum_oid_index,pg_enum_typid_label_index,pg_enum_typid_sortorder_ind
ex,pg_namespace_nspname_index,pg_namespace_oid_index,pg_conversion_default_index,pg_conversion_name_nsp_index,pg_conversion_oid_index,pg_depend_depender_index,pg_depend_reference_index,pg_tablespace_oid_index,pg_tablespace_spcname_index,
pg_pltemplate_name_index,pg_shdepend_depender_index,pg_shdepend_reference_index,pg_shdescription_o_c_index,pg_ts_config_cfgname_index,pg_ts_config_oid_index,pg_oid_16417_index,pg_type_oid_index,pg_user_mapping_oid_index,pg_user_mapping_u
ser_server_index,pg_ts_config_map_index,pg_ts_dict_dictname_index,pg_ts_parser_prsname_index,pg_ts_parser_oid_index,pg_ts_template_tmplname_index,pg_ts_template_oid_index,pg_extension_oid_index,pg_extension_name_index,pg_foreign_data_wra
pper_oid_index,pg_foreign_data_wrapper_name_index,pg_foreign_server_oid_index,pg_foreign_server_name_index,pg_foreign_table_relid_index,pg_default_acl_role_nsp_obj_index,pg_default_acl_oid_index,pg_seclabel_object_index,pg_shseclabel_obj
ect_index,pg_range_rngtypid_index,pg_synonym_oid_index,pg_variable_oid_index,pg_variable_varname_pkg_index,edb_dir_oid_index,edb_dir_name_index,edb_policy_oid_index,edb_policy_object_name_index,edb_partdef_oid_index,edb_partdef_pdefrel_i
ndex,edb_partition_oid_index,edb_partition_pdefid_index,pg_toast_16417_index,pg_oid_16431_index,pg_toast_16431_index,pg_toast_12506_index,pg_toast_12511_index,pg_toast_12516_index,pg_toast_12521_index,pg_toast_12526_index,pg_toast_12531_
index,pg_toast_13390_index,pg_toast_13857_index,system_waits_pk,pg_toast_13864_index,session_waits_pk,pg_toast_13871_index,session_waits_hist_pk,edb$stat_idx_pk,edb$stat_tab_pk,edb$stat_db_pk,edb$statio_idx_pk,edb$statio_tab_pk,pg_authid
_oid_index,pg_statistic_relid_att_inh_index,pg_type_typname_nsp_index,pg_largeobject_loid_pn_index,pg_class_oid_index,pg_class_relname_nsp_index,pg_toast_2604_index,pg_attrdef_adrelid_adnum_index,pg_attrdef_oid_index,pg_constraint_connam
e_nsp_index,pg_constraint_conrelid_index,pg_constraint_contypid_index,pg_constraint_oid_index,pg_db_role_setting_databaseid_rol_index,pg_ts_dict_oid_index,pg_synonym_synname_nspoid_index,pg_toast_12501_index,edb_partition_partrelid_index
,pg_toast_13397_index,pg_toast_13383_index,plsql_profiler_runs_pkey,pg_toast_13850_index,snap_pk
...

How aggregates work is covered in:
https://yq.aliyun.com/articles/5060
