
PostgreSQL WAL receiver statistics patch

I previously wrote an article about how to write a C function that, on a PostgreSQL hot standby, retrieves the WAL receiver's statistics and the upstream node's connection information (conninfo).
PostgreSQL 9.6 has merged this functionality into core.
The patch address is as follows:

The main new code:
To read the state out consistently, &walrcv->mutex must be held.

+
+/*
+ * Return a string constant representing the state. This is used
+ * in system functions and views, and should *not* be translated.
+ */
+static const char *
+WalRcvGetStateString(WalRcvState state)
+{
+   switch (state)
+   {
+       case WALRCV_STOPPED:
+           return "stopped";
+       case WALRCV_STARTING:
+           return "starting";
+       case WALRCV_STREAMING:
+           return "streaming";
+       case WALRCV_WAITING:
+           return "waiting";
+       case WALRCV_RESTARTING:
+           return "restarting";
+       case WALRCV_STOPPING:
+           return "stopping";
+   }
+   return "UNKNOWN";
+}
+
+/*
+ * Returns activity of WAL receiver, including pid, state and xlog locations
+ * received from the WAL sender of another server.
+ */
+Datum
+pg_stat_get_wal_receiver(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAL_RECEIVER_COLS  11
+   TupleDesc   tupdesc;
+   Datum       values[PG_STAT_GET_WAL_RECEIVER_COLS];
+   bool        nulls[PG_STAT_GET_WAL_RECEIVER_COLS];
+   WalRcvData *walrcv = WalRcv;
+   WalRcvState state;
+   XLogRecPtr  receive_start_lsn;
+   TimeLineID  receive_start_tli;
+   XLogRecPtr  received_lsn;
+   TimeLineID  received_tli;
+   TimestampTz last_send_time;
+   TimestampTz last_receipt_time;
+   XLogRecPtr  latest_end_lsn;
+   TimestampTz latest_end_time;
+   char       *slotname;
+
+   /* No WAL receiver, just return a tuple with NULL values */
+   if (walrcv->pid == 0)
+       PG_RETURN_NULL();
+
+   /* Initialise values and NULL flags arrays */
+   MemSet(values, 0, sizeof(values));
+   MemSet(nulls, 0, sizeof(nulls));
+
+   /* Initialise attributes information in the tuple descriptor */
+   tupdesc = CreateTemplateTupleDesc(PG_STAT_GET_WAL_RECEIVER_COLS, false);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 1, "pid",
+                      INT4OID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 2, "status",
+                      TEXTOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 3, "receive_start_lsn",
+                      LSNOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 4, "receive_start_tli",
+                      INT4OID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 5, "received_lsn",
+                      LSNOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 6, "received_tli",
+                      INT4OID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 7, "last_msg_send_time",
+                      TIMESTAMPTZOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 8, "last_msg_receipt_time",
+                      TIMESTAMPTZOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 9, "latest_end_lsn",
+                      LSNOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 10, "latest_end_time",
+                      TIMESTAMPTZOID, -1, 0);
+   TupleDescInitEntry(tupdesc, (AttrNumber) 11, "slot_name",
+                      TEXTOID, -1, 0);
+
+   BlessTupleDesc(tupdesc);
+
+   /* Take a lock to ensure value consistency */
+   SpinLockAcquire(&walrcv->mutex);
+   state = walrcv->walRcvState;
+   receive_start_lsn = walrcv->receiveStart;
+   receive_start_tli = walrcv->receiveStartTLI;
+   received_lsn = walrcv->receivedUpto;
+   received_tli = walrcv->receivedTLI;
+   last_send_time = walrcv->lastMsgSendTime;
+   last_receipt_time = walrcv->lastMsgReceiptTime;
+   latest_end_lsn = walrcv->latestWalEnd;
+   latest_end_time = walrcv->latestWalEndTime;
+   slotname = pstrdup(walrcv->slotname);
+   SpinLockRelease(&walrcv->mutex);
+
+   /* Fetch values */
+   values[0] = Int32GetDatum(walrcv->pid);
+
+   if (!superuser())
+   {
+       /*
+        * Only superusers can see details. Other users only get the pid
+        * value to know whether it is a WAL receiver, but no details.
+        */
+       MemSet(&nulls[1], true, PG_STAT_GET_WAL_RECEIVER_COLS - 1);
+   }
+   else
+   {
+       values[1] = CStringGetTextDatum(WalRcvGetStateString(state));
+
+       if (XLogRecPtrIsInvalid(receive_start_lsn))
+           nulls[2] = true;
+       else
+           values[2] = LSNGetDatum(receive_start_lsn);
+       values[3] = Int32GetDatum(receive_start_tli);
+       if (XLogRecPtrIsInvalid(received_lsn))
+           nulls[4] = true;
+       else
+           values[4] = LSNGetDatum(received_lsn);
+       values[5] = Int32GetDatum(received_tli);
+       if (last_send_time == 0)
+           nulls[6] = true;
+       else
+           values[6] = TimestampTzGetDatum(last_send_time);
+       if (last_receipt_time == 0)
+           nulls[7] = true;
+       else
+           values[7] = TimestampTzGetDatum(last_receipt_time);
+       if (XLogRecPtrIsInvalid(latest_end_lsn))
+           nulls[8] = true;
+       else
+           values[8] = LSNGetDatum(latest_end_lsn);
+       if (latest_end_time == 0)
+           nulls[9] = true;
+       else
+           values[9] = TimestampTzGetDatum(latest_end_time);
+       if (*slotname == '\0')
+           nulls[10] = true;
+       else
+           values[10] = CStringGetTextDatum(slotname);
+   }
+
+   /* Returns the record as Datum */
+   PG_RETURN_DATUM(HeapTupleGetDatum(
+                         heap_form_tuple(tupdesc, values, nulls)));
+}

Usage reference:
Information available from the view:
The PID of the WAL receiver process.
Its current state (stopped, starting, streaming, waiting, restarting, stopping, unknown).
receive_start_lsn and receive_start_tli: the WAL position and timeline at which the WAL receiver started receiving.
received_lsn and received_tli: the WAL position and timeline the WAL receiver has already received and flushed to disk.
last_msg_send_time: the send timestamp, as stamped by the WAL sender, of the last message received from it.
last_msg_receipt_time: the local time on the WAL receiver when that last message arrived.
(last_msg_receipt_time - last_msg_send_time) is therefore the network delay (provided, of course, that the two servers' clocks are in sync).
latest_end_lsn: the last received-WAL position fed back to the WAL sender.
latest_end_time: the timestamp of the last feedback sent to the WAL sender.
The slot name on the upstream node.

Table 27-5. pg_stat_wal_receiver View

Column | Type | Description
pid | integer | Process ID of the WAL receiver process
status | text | Activity status of the WAL receiver process
receive_start_lsn | pg_lsn | First transaction log position used when WAL receiver is started
receive_start_tli | integer | First timeline number used when WAL receiver is started
received_lsn | pg_lsn | Last transaction log position already received and flushed to disk, the initial value of this field being the first log position used when WAL receiver is started
received_tli | integer | Timeline number of last transaction log position received and flushed to disk, the initial value of this field being the timeline number of the first log position used when WAL receiver is started
last_msg_send_time | timestamp with time zone | Send time of last message received from origin WAL sender
last_msg_receipt_time | timestamp with time zone | Receipt time of last message received from origin WAL sender
latest_end_lsn | pg_lsn | Last transaction log position reported to origin WAL sender
latest_end_time | timestamp with time zone | Time of last transaction log position reported to origin WAL sender
slot_name | text | Replication slot name used by this WAL receiver

The pg_stat_wal_receiver view will contain only one row, showing statistics about the WAL receiver from that receiver's connected server.
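A minimal usage sketch (not from the original article; columns as defined above), run on a hot standby, deriving the network delay from the two message timestamps:

select pid, status, received_lsn, received_tli,
       last_msg_receipt_time - last_msg_send_time as network_delay
from pg_stat_wal_receiver;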


PostgreSQL pg_stat_ssl view patch

A PostgreSQL 9.6 patch that lets users inspect each backend's connection information. For SSL connections it shows the SSL version, the cipher algorithm, the number of encryption bits, whether compression is in use, the client certificate DN, and so on.

Table 27-6. pg_stat_ssl View

Column | Type | Description
pid | integer | Process ID of a backend or WAL sender process
ssl | boolean | True if SSL is used on this connection
version | text | Version of SSL in use, or NULL if SSL is not in use on this connection
cipher | text | Name of SSL cipher in use, or NULL if SSL is not in use on this connection
bits | integer | Number of bits in the encryption algorithm used, or NULL if SSL is not used on this connection
compression | boolean | True if SSL compression is in use, false if not, or NULL if SSL is not in use on this connection
clientdn | text | Distinguished Name (DN) field from the client certificate used, or NULL if no client certificate was supplied or if SSL is not in use on this connection. This field is truncated if the DN field is longer than NAMEDATALEN (64 characters in a standard build)

The pg_stat_ssl view will contain one row per backend or WAL sender process, showing statistics about SSL usage on this connection. It can be joined to pg_stat_activity or pg_stat_replication on the pid column to get more details about the connection.
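A hedged example of that join (column list taken from the table above), listing which connected clients are using SSL:

select a.datname, a.usename, a.client_addr, s.ssl, s.version, s.cipher, s.bits
from pg_stat_activity a
join pg_stat_ssl s on (s.pid = a.pid);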

pgq logical incremental replication: a real-world case


See https://github.com/digoal/pgsql_admin_script/blob/master/pgq_case.md for details.

A practical pgq use case: implementing online incremental replication.

Create the source database

postgres=# create database src;  
CREATE DATABASE

Create the destination database

postgres=# create database dest;  
CREATE DATABASE

Connect to the source database

\c src

Create the test tables.
Group 1: these two tables are linked by a foreign key and are modified within a single transaction, so the change-tracking records of all tables touched by one transaction must be inserted into the same tracking table.

create table grp1_tbl1 (id int8 primary key, info text, crt_time timestamp) ;  
create table grp1_tbl2 ( id int8 primary key, tbl1_id int8 REFERENCES grp1_tbl1(id) DEFERRABLE INITIALLY DEFERRED, info text, crt_time timestamp );

Group 2: these two tables are likewise linked by a foreign key and modified within a single transaction, so their change-tracking records must go into the same tracking table.

create table grp2_tbl1 (id int8 primary key, info text, crt_time timestamp);  
create table grp2_tbl2 ( id int8 primary key, tbl1_id int8 REFERENCES grp2_tbl1(id) DEFERRABLE INITIALLY DEFERRED, info text, crt_time timestamp );

Group 3: these two tables are likewise linked by a foreign key and modified within a single transaction, so their change-tracking records must go into the same tracking table.

create table grp3_tbl1 (id int8 primary key, info text, crt_time timestamp);  
create table grp3_tbl2 ( id int8 primary key, tbl1_id int8 REFERENCES grp3_tbl1(id) DEFERRABLE INITIALLY DEFERRED, info text, crt_time timestamp );

Create the pgbench test script. The three table groups are driven by three transactions; each transaction performs two insert-or-update operations on each of the group's two tables, plus one delete.

vi test.sql  
\setrandom grp1_tbl1_id 1 1000000  
\setrandom grp1_tbl2_id 1 2000000  
\setrandom grp2_tbl1_id 1 1000000  
\setrandom grp2_tbl2_id 1 2000000  
\setrandom grp3_tbl1_id 1 1000000  
\setrandom grp3_tbl2_id 1 2000000  

begin;  
insert into grp1_tbl1 (id,info,crt_time) values (:grp1_tbl1_id, md5(random()::text), now()) on conflict ON CONSTRAINT grp1_tbl1_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp1_tbl2 (id,tbl1_id,info,crt_time) values (:grp1_tbl2_id, :grp1_tbl1_id, md5(random()::text), now()) on conflict ON CONSTRAINT grp1_tbl2_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp1_tbl1 (id,info,crt_time) values (:grp1_tbl1_id+1, md5(random()::text), now()) on conflict ON CONSTRAINT grp1_tbl1_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp1_tbl2 (id,tbl1_id,info,crt_time) values (:grp1_tbl2_id+1, :grp1_tbl1_id+1, md5(random()::text), now()) on conflict ON CONSTRAINT grp1_tbl2_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
delete from grp1_tbl2 where id = (:grp1_tbl2_id+100);  
end;  

begin;  
insert into grp2_tbl1 (id,info,crt_time) values (:grp2_tbl1_id, md5(random()::text), now()) on conflict ON CONSTRAINT grp2_tbl1_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp2_tbl2 (id,tbl1_id,info,crt_time) values (:grp2_tbl2_id, :grp2_tbl1_id, md5(random()::text), now()) on conflict ON CONSTRAINT grp2_tbl2_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp2_tbl1 (id,info,crt_time) values (:grp2_tbl1_id+1, md5(random()::text), now()) on conflict ON CONSTRAINT grp2_tbl1_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp2_tbl2 (id,tbl1_id,info,crt_time) values (:grp2_tbl2_id+1, :grp2_tbl1_id+1, md5(random()::text), now()) on conflict ON CONSTRAINT grp2_tbl2_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
delete from grp2_tbl2 where id = (:grp2_tbl2_id+100);  
end;  

begin;  
insert into grp3_tbl1 (id,info,crt_time) values (:grp3_tbl1_id, md5(random()::text), now()) on conflict ON CONSTRAINT grp3_tbl1_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp3_tbl2 (id,tbl1_id,info,crt_time) values (:grp3_tbl2_id, :grp3_tbl1_id, md5(random()::text), now()) on conflict ON CONSTRAINT grp3_tbl2_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp3_tbl1 (id,info,crt_time) values (:grp3_tbl1_id+1, md5(random()::text), now()) on conflict ON CONSTRAINT grp3_tbl1_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
insert into grp3_tbl2 (id,tbl1_id,info,crt_time) values (:grp3_tbl2_id+1, :grp3_tbl1_id+1, md5(random()::text), now()) on conflict ON CONSTRAINT grp3_tbl2_pkey do update set info=excluded.info,crt_time=excluded.crt_time;  
delete from grp3_tbl2 where id = (:grp3_tbl2_id+100);  
end;

Generate some initial data

pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 20 src

Create the hstore extension

create extension hstore;
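hstore is what the tracking tables use to capture whole rows; the consumer functions further below rebuild SQL from it with hstore_to_matrix, quote_ident and quote_nullable. A small illustration with made-up values:

select hstore_to_matrix('id=>1, info=>NULL'::hstore);
-- {{id,1},{info,NULL}}

select quote_nullable(null);
-- NULL   (the literal string the consumer functions compare against to emit "is null")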

Create the mq schema

CREATE SCHEMA IF NOT EXISTS mq;

Create a function that returns the transaction's end (commit) time

create or replace function mq.get_commit_time() returns timestamp without time zone as $$  
declare  
  res timestamp without time zone;  
begin  
  show commit_time.realval into res;  
  return res;  
exception when others then  -- not yet set in this transaction; set it with the SQL below  
  res := clock_timestamp();  
  execute 'set local commit_time.realval = '''||res||'''';  -- a transaction-scoped variable  
  return res;  
end;  
$$ language plpgsql;
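A quick sanity check (a sketch, not from the original article): every call within one transaction returns the same value, which is what gives all change records of a transaction an identical crt_time:

begin;
select mq.get_commit_time();
select pg_sleep(1);
select mq.get_commit_time();  -- same timestamp as the first call
commit;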

Create three groups of change-tracking tables. In production, create as many groups as needed; the benefit of multiple groups is that different tracking tables can be replayed in parallel on the destination.
If one transaction modifies several tables, the change records of those tables must all land in the same tracking table so that replay preserves transactional consistency. (A hedged sketch of these definitions appears after the headings below.)

Group-1 tracking tables

Trigger function for the group-1 tracking tables

Group-2 tracking tables

Trigger function for the group-2 tracking tables

Group-3 tracking tables

Trigger function for the group-3 tracking tables
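The original definitions are not reproduced above. As an assumption-laden reconstruction, inferred only from the columns the consumer function below reads (id, crt_time, consumed, x_id, op, table_schema, table_name, old_rec, new_rec) and from the rule that one transaction's rows stay in one table, each group could look roughly like this (the names and the txid-based routing are hypothetical):

-- hypothetical sketch: one of the seven group-1 tracking tables (suffixes _0 .. _6)
create table mq.table_change_rec_grp1_0 (
  id serial8 primary key,                 -- replay order within a transaction
  x_id int8 default txid_current(),       -- txid of the traced transaction
  crt_time timestamp without time zone default mq.get_commit_time(),
  consumed boolean default false,         -- set to true once fetched for replay
  op text,                                -- INSERT / UPDATE / DELETE
  table_schema name,
  table_name name,
  old_rec hstore,                         -- whole old row (UPDATE/DELETE)
  new_rec hstore                          -- whole new row (INSERT/UPDATE)
);

-- hypothetical sketch of the trigger function: routing by txid_current() % 7
-- keeps every row change of one transaction in the same tracking table
create or replace function mq.dml_trace_grp1() returns trigger as $$
declare
  o hstore;
  n hstore;
begin
  if TG_OP in ('UPDATE','DELETE') then o := hstore(OLD); end if;
  if TG_OP in ('INSERT','UPDATE') then n := hstore(NEW); end if;
  execute format('insert into mq.table_change_rec_grp1_%s
                    (op, table_schema, table_name, old_rec, new_rec)
                  values ($1, $2, $3, $4, $5)', txid_current() % 7)
    using TG_OP, TG_TABLE_SCHEMA, TG_TABLE_NAME, o, n;
  return null;  -- AFTER trigger: the return value is ignored
end;
$$ language plpgsql;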

Create triggers on the group-1 test tables

CREATE CONSTRAINT TRIGGER tg AFTER INSERT OR DELETE OR UPDATE ON grp1_tbl1 DEFERRABLE INITIALLY DEFERRED FOR EACH ROW EXECUTE PROCEDURE mq.dml_trace_grp1();  
CREATE CONSTRAINT TRIGGER tg AFTER INSERT OR DELETE OR UPDATE ON grp1_tbl2 DEFERRABLE INITIALLY DEFERRED FOR EACH ROW EXECUTE PROCEDURE mq.dml_trace_grp1();

Create triggers on the group-2 test tables

CREATE CONSTRAINT TRIGGER tg AFTER INSERT OR DELETE OR UPDATE ON grp2_tbl1 DEFERRABLE INITIALLY DEFERRED FOR EACH ROW EXECUTE PROCEDURE mq.dml_trace_grp2();  
CREATE CONSTRAINT TRIGGER tg AFTER INSERT OR DELETE OR UPDATE ON grp2_tbl2 DEFERRABLE INITIALLY DEFERRED FOR EACH ROW EXECUTE PROCEDURE mq.dml_trace_grp2();

Create triggers on the group-3 test tables

CREATE CONSTRAINT TRIGGER tg AFTER INSERT OR DELETE OR UPDATE ON grp3_tbl1 DEFERRABLE INITIALLY DEFERRED FOR EACH ROW EXECUTE PROCEDURE mq.dml_trace_grp3();  
CREATE CONSTRAINT TRIGGER tg AFTER INSERT OR DELETE OR UPDATE ON grp3_tbl2 DEFERRABLE INITIALLY DEFERRED FOR EACH ROW EXECUTE PROCEDURE mq.dml_trace_grp3();

Create the group-1 consumer function

create or replace function mq.build_sql_grp1(n int) returns setof text as $$  
declare  
  m int := 0;  
  v_table_change_rec_grp1 mq.table_change_rec_grp1;  
  v_tablename name;  
  v_crt_time timestamp without time zone;  
  curs1 refcursor;  
  v_sql text := '';  
  v_cols text := '';  
  v_vals text := '';  
  v_upd_set text := '';  
  v_upd_del_where text :='';  
  v_x_id int8;  
  v_max_crt_time timestamp without time zone;  
begin  
  if n <=0 then  
    -- raise notice 'n must be > 0.';  
    return;  
  end if;  

  return next 'BEGIN;';  

-- pick the queue table holding the oldest unconsumed record  
  select tablename,crt_time into v_tablename,v_crt_time from   
  (  
  select 'table_change_rec_grp1_0' as tablename,min(crt_time) as crt_time from mq.table_change_rec_grp1_0 where consumed=false  
    union all  
  select 'table_change_rec_grp1_1' as tablename,min(crt_time) as crt_time from mq.table_change_rec_grp1_1 where consumed=false  
    union all  
  select 'table_change_rec_grp1_2' as tablename,min(crt_time) as crt_time from mq.table_change_rec_grp1_2 where consumed=false  
    union all  
  select 'table_change_rec_grp1_3' as tablename,min(crt_time) as crt_time from mq.table_change_rec_grp1_3 where consumed=false  
    union all  
  select 'table_change_rec_grp1_4' as tablename,min(crt_time) as crt_time from mq.table_change_rec_grp1_4 where consumed=false  
    union all  
  select 'table_change_rec_grp1_5' as tablename,min(crt_time) as crt_time from mq.table_change_rec_grp1_5 where consumed=false  
    union all  
  select 'table_change_rec_grp1_6' as tablename,min(crt_time) as crt_time from mq.table_change_rec_grp1_6 where consumed=false  
  ) t   
  order by crt_time limit 1;  

case v_tablename  

when 'table_change_rec_grp1_0' then  
  -- Commit time: the deferred trigger fires each row trigger at transaction end, where mq.get_commit_time() supplies the time, so every row in a transaction carries an identical timestamp.
  -- Replay order matches commit order; the smallest atomic unit is the transaction.
  -- When a single transaction contains multiple SQL statements, their relative order can be recovered from the command id, or from a sequence.
  -- If several transactions commit at the same instant with identical timestamps and each contains multiple rows, their rows may be replayed interleaved. Batch replay merges them into one transaction, so consistency is unaffected; single-transaction replay picks one of them arbitrarily.
  if n=1 then  
    select x_id into v_x_id from mq.table_change_rec_grp1_0 where consumed=false order by crt_time,id limit 1;  
    open curs1 for select * from mq.table_change_rec_grp1_0 where consumed=false and x_id=v_x_id order by crt_time,id for update;  
  else  
    select crt_time into v_crt_time from mq.table_change_rec_grp1_0 where consumed=false order by crt_time,id limit 1 offset n-1;  
    if found then
      open curs1 for select * from mq.table_change_rec_grp1_0 where consumed=false and crt_time<=v_crt_time order by crt_time,id for update;  
    else  
      -- n exceeds the remaining change records  
      open curs1 for select * from mq.table_change_rec_grp1_0 where consumed=false order by crt_time,id for update;  
    end if;  
  end if;  

fetch curs1 into v_table_change_rec_grp1;  
LOOP  
if found then  
-- raise notice '%', v_table_change_rec_grp1;  
-- build sql  
-- case tg insert,update,delete,ddl  
-- quote_ident wraps schema, table and column names  
-- quote_nullable wraps values  
-- For a table without a primary key, if duplicate rows exist and a single row was changed via ctid on the source, the destination will diverge (avoid ctid, or require a primary key / non-null unique constraint)  
case v_table_change_rec_grp1.op  
when 'INSERT' then  
-- assemble the column and value lists  
v_cols := '' ;  
v_vals := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_cols := v_cols || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || ',' ;  
  v_vals := v_vals || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
end loop;  
v_cols := rtrim(v_cols, ',') ;  
v_vals := rtrim(v_vals, ',') ;  

-- assemble the SQL  
v_sql := 'insert into '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||'('||v_cols||')'||' values('||v_vals||');' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_0 set consumed=true where current of curs1;  
return next v_sql;  

when 'UPDATE' then  
-- assemble the SET and WHERE clauses  
v_upd_set := '' ;  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_upd_set := v_upd_set || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_set := rtrim(v_upd_set, ',') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'update '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' set '||v_upd_set||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_0 set consumed=true where current of curs1;  
return next v_sql;  

when 'DELETE' then  
-- assemble the WHERE clause  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.old_rec),1) loop  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'delete from '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_0 set consumed=true where current of curs1;  
return next v_sql;  

else  
  -- raise notice 'I do not known how to deal this op: %', v_table_change_rec_grp1.op;  
end case;  

else  
close curs1;  
return next 'END;';  
return;  
end if;  
fetch curs1 into v_table_change_rec_grp1;  
END LOOP;  


when 'table_change_rec_grp1_1' then  
  -- Commit time: the deferred trigger fires each row trigger at transaction end, where mq.get_commit_time() supplies the time, so every row in a transaction carries an identical timestamp.
  -- Replay order matches commit order; the smallest atomic unit is the transaction.
  -- When a single transaction contains multiple SQL statements, their relative order can be recovered from the command id, or from a sequence.
  -- If several transactions commit at the same instant with identical timestamps and each contains multiple rows, their rows may be replayed interleaved. Batch replay merges them into one transaction, so consistency is unaffected; single-transaction replay picks one of them arbitrarily.
  if n=1 then  
    select x_id into v_x_id from mq.table_change_rec_grp1_1 where consumed=false order by crt_time,id limit 1;  
    open curs1 for select * from mq.table_change_rec_grp1_1 where consumed=false and x_id=v_x_id order by crt_time,id for update;  
  else  
    select crt_time into v_crt_time from mq.table_change_rec_grp1_1 where consumed=false order by crt_time,id limit 1 offset n-1;  
    if found then
      open curs1 for select * from mq.table_change_rec_grp1_1 where consumed=false and crt_time<=v_crt_time order by crt_time,id for update;  
    else  
      -- n exceeds the remaining change records  
      open curs1 for select * from mq.table_change_rec_grp1_1 where consumed=false order by crt_time,id for update;  
    end if;  
  end if;  

fetch curs1 into v_table_change_rec_grp1;  
LOOP  
if found then  
-- raise notice '%', v_table_change_rec_grp1;  
-- build sql  
-- case tg insert,update,delete,ddl  
-- quote_ident wraps schema, table and column names  
-- quote_nullable wraps values  
-- For a table without a primary key, if duplicate rows exist and a single row was changed via ctid on the source, the destination will diverge (avoid ctid, or require a primary key / non-null unique constraint)  
case v_table_change_rec_grp1.op  
when 'INSERT' then  
-- assemble the column and value lists  
v_cols := '' ;  
v_vals := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_cols := v_cols || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || ',' ;  
  v_vals := v_vals || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
end loop;  
v_cols := rtrim(v_cols, ',') ;  
v_vals := rtrim(v_vals, ',') ;  

-- assemble the SQL  
v_sql := 'insert into '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||'('||v_cols||')'||' values('||v_vals||');' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_1 set consumed=true where current of curs1;  
return next v_sql;  

when 'UPDATE' then  
-- assemble the SET and WHERE clauses  
v_upd_set := '' ;  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_upd_set := v_upd_set || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_set := rtrim(v_upd_set, ',') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'update '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' set '||v_upd_set||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_1 set consumed=true where current of curs1;  
return next v_sql;  

when 'DELETE' then  
-- assemble the WHERE clause  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.old_rec),1) loop  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'delete from '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_1 set consumed=true where current of curs1;  
return next v_sql;  

else  
  -- raise notice 'I do not known how to deal this op: %', v_table_change_rec_grp1.op;  
end case;  

else  
close curs1;  
return next 'END;';  
return;  
end if;  
fetch curs1 into v_table_change_rec_grp1;  
END LOOP;  


when 'table_change_rec_grp1_2' then  
  -- Commit time: the deferred trigger fires each row trigger at transaction end, where mq.get_commit_time() supplies the time, so every row in a transaction carries an identical timestamp.
  -- Replay order matches commit order; the smallest atomic unit is the transaction.
  -- When a single transaction contains multiple SQL statements, their relative order can be recovered from the command id, or from a sequence.
  -- If several transactions commit at the same instant with identical timestamps and each contains multiple rows, their rows may be replayed interleaved. Batch replay merges them into one transaction, so consistency is unaffected; single-transaction replay picks one of them arbitrarily.
  if n=1 then  
    select x_id into v_x_id from mq.table_change_rec_grp1_2 where consumed=false order by crt_time,id limit 1;  
    open curs1 for select * from mq.table_change_rec_grp1_2 where consumed=false and x_id=v_x_id order by crt_time,id for update;  
  else  
    select crt_time into v_crt_time from mq.table_change_rec_grp1_2 where consumed=false order by crt_time,id limit 1 offset n-1;  
    if found then
      open curs1 for select * from mq.table_change_rec_grp1_2 where consumed=false and crt_time<=v_crt_time order by crt_time,id for update;  
    else  
      -- n exceeds the remaining change records  
      open curs1 for select * from mq.table_change_rec_grp1_2 where consumed=false order by crt_time,id for update;  
    end if;  
  end if;  

fetch curs1 into v_table_change_rec_grp1;  
LOOP  
if found then  
-- raise notice '%', v_table_change_rec_grp1;  
-- build sql  
-- case tg insert,update,delete,ddl  
-- quote_ident wraps schema, table and column names  
-- quote_nullable wraps values  
-- For a table without a primary key, if duplicate rows exist and a single row was changed via ctid on the source, the destination will diverge (avoid ctid, or require a primary key / non-null unique constraint)  
case v_table_change_rec_grp1.op  
when 'INSERT' then  
-- assemble the column and value lists  
v_cols := '' ;  
v_vals := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_cols := v_cols || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || ',' ;  
  v_vals := v_vals || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
end loop;  
v_cols := rtrim(v_cols, ',') ;  
v_vals := rtrim(v_vals, ',') ;  

-- assemble the SQL  
v_sql := 'insert into '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||'('||v_cols||')'||' values('||v_vals||');' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_2 set consumed=true where current of curs1;  
return next v_sql;  

when 'UPDATE' then  
-- assemble the SET and WHERE clauses  
v_upd_set := '' ;  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_upd_set := v_upd_set || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_set := rtrim(v_upd_set, ',') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'update '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' set '||v_upd_set||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_2 set consumed=true where current of curs1;  
return next v_sql;  

when 'DELETE' then  
-- assemble the WHERE clause  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.old_rec),1) loop  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'delete from '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_2 set consumed=true where current of curs1;  
return next v_sql;  

else  
  -- raise notice 'I do not known how to deal this op: %', v_table_change_rec_grp1.op;  
end case;  

else  
close curs1;  
return next 'END;';  
return;  
end if;  
fetch curs1 into v_table_change_rec_grp1;  
END LOOP;  


when 'table_change_rec_grp1_3' then  
  -- Commit time: the deferred trigger fires each row trigger at transaction end, where mq.get_commit_time() supplies the time, so every row in a transaction carries an identical timestamp.
  -- Replay order matches commit order; the smallest atomic unit is the transaction.
  -- When a single transaction contains multiple SQL statements, their relative order can be recovered from the command id, or from a sequence.
  -- If several transactions commit at the same instant with identical timestamps and each contains multiple rows, their rows may be replayed interleaved. Batch replay merges them into one transaction, so consistency is unaffected; single-transaction replay picks one of them arbitrarily.
  if n=1 then  
    select x_id into v_x_id from mq.table_change_rec_grp1_3 where consumed=false order by crt_time,id limit 1;  
    open curs1 for select * from mq.table_change_rec_grp1_3 where consumed=false and x_id=v_x_id order by crt_time,id for update;  
  else  
    select crt_time into v_crt_time from mq.table_change_rec_grp1_3 where consumed=false order by crt_time,id limit 1 offset n-1;  
    if found then
      open curs1 for select * from mq.table_change_rec_grp1_3 where consumed=false and crt_time<=v_crt_time order by crt_time,id for update;  
    else  
      -- n exceeds the remaining change records  
      open curs1 for select * from mq.table_change_rec_grp1_3 where consumed=false order by crt_time,id for update;  
    end if;  
  end if;  

fetch curs1 into v_table_change_rec_grp1;  
LOOP  
if found then  
-- raise notice '%', v_table_change_rec_grp1;  
-- build sql  
-- case tg insert,update,delete,ddl  
-- quote_ident wraps schema, table and column names  
-- quote_nullable wraps values  
-- For a table without a primary key, if duplicate rows exist and a single row was changed via ctid on the source, the destination will diverge (avoid ctid, or require a primary key / non-null unique constraint)  
case v_table_change_rec_grp1.op  
when 'INSERT' then  
-- assemble the column and value lists  
v_cols := '' ;  
v_vals := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_cols := v_cols || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || ',' ;  
  v_vals := v_vals || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
end loop;  
v_cols := rtrim(v_cols, ',') ;  
v_vals := rtrim(v_vals, ',') ;  

-- assemble the SQL  
v_sql := 'insert into '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||'('||v_cols||')'||' values('||v_vals||');' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_3 set consumed=true where current of curs1;  
return next v_sql;  

when 'UPDATE' then  
-- assemble the SET and WHERE clauses  
v_upd_set := '' ;  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_upd_set := v_upd_set || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_set := rtrim(v_upd_set, ',') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'update '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' set '||v_upd_set||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_3 set consumed=true where current of curs1;  
return next v_sql;  

when 'DELETE' then  
-- assemble the WHERE clause  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.old_rec),1) loop  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'delete from '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_3 set consumed=true where current of curs1;  
return next v_sql;  

else  
  -- raise notice 'I do not known how to deal this op: %', v_table_change_rec_grp1.op;  
end case;  

else  
close curs1;  
return next 'END;';  
return;  
end if;  
fetch curs1 into v_table_change_rec_grp1;  
END LOOP;  


when 'table_change_rec_grp1_4' then  
  -- Commit time: the deferred trigger fires each row trigger at transaction end, where mq.get_commit_time() supplies the time, so every row in a transaction carries an identical timestamp.
  -- Replay order matches commit order; the smallest atomic unit is the transaction.
  -- When a single transaction contains multiple SQL statements, their relative order can be recovered from the command id, or from a sequence.
  -- If several transactions commit at the same instant with identical timestamps and each contains multiple rows, their rows may be replayed interleaved. Batch replay merges them into one transaction, so consistency is unaffected; single-transaction replay picks one of them arbitrarily.
  if n=1 then  
    select x_id into v_x_id from mq.table_change_rec_grp1_4 where consumed=false order by crt_time,id limit 1;  
    open curs1 for select * from mq.table_change_rec_grp1_4 where consumed=false and x_id=v_x_id order by crt_time,id for update;  
  else  
    select crt_time into v_crt_time from mq.table_change_rec_grp1_4 where consumed=false order by crt_time,id limit 1 offset n-1;  
    if found then
      open curs1 for select * from mq.table_change_rec_grp1_4 where consumed=false and crt_time<=v_crt_time order by crt_time,id for update;  
    else  
      -- n exceeds the remaining change records  
      open curs1 for select * from mq.table_change_rec_grp1_4 where consumed=false order by crt_time,id for update;  
    end if;  
  end if;  

fetch curs1 into v_table_change_rec_grp1;  
LOOP  
if found then  
-- raise notice '%', v_table_change_rec_grp1;  
-- build sql  
-- case tg insert,update,delete,ddl  
-- quote_ident wraps schema, table and column names  
-- quote_nullable wraps values  
-- For a table without a primary key, if duplicate rows exist and a single row was changed via ctid on the source, the destination will diverge (avoid ctid, or require a primary key / non-null unique constraint)  
case v_table_change_rec_grp1.op  
when 'INSERT' then  
-- assemble the column and value lists  
v_cols := '' ;  
v_vals := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_cols := v_cols || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || ',' ;  
  v_vals := v_vals || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
end loop;  
v_cols := rtrim(v_cols, ',') ;  
v_vals := rtrim(v_vals, ',') ;  

-- assemble the SQL  
v_sql := 'insert into '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||'('||v_cols||')'||' values('||v_vals||');' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_4 set consumed=true where current of curs1;  
return next v_sql;  

when 'UPDATE' then  
-- assemble the SET and WHERE clauses  
v_upd_set := '' ;  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_upd_set := v_upd_set || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_set := rtrim(v_upd_set, ',') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'update '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' set '||v_upd_set||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_4 set consumed=true where current of curs1;  
return next v_sql;  

when 'DELETE' then  
-- assemble the WHERE clause  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.old_rec),1) loop  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'delete from '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_4 set consumed=true where current of curs1;  
return next v_sql;  

else  
  -- raise notice 'I do not known how to deal this op: %', v_table_change_rec_grp1.op;  
end case;  

else  
close curs1;  
return next 'END;';  
return;  
end if;  
fetch curs1 into v_table_change_rec_grp1;  
END LOOP;  


when 'table_change_rec_grp1_5' then  
  -- Commit time: the deferred trigger fires each row trigger at transaction end, where mq.get_commit_time() supplies the time, so every row in a transaction carries an identical timestamp.
  -- Replay order matches commit order; the smallest atomic unit is the transaction.
  -- When a single transaction contains multiple SQL statements, their relative order can be recovered from the command id, or from a sequence.
  -- If several transactions commit at the same instant with identical timestamps and each contains multiple rows, their rows may be replayed interleaved. Batch replay merges them into one transaction, so consistency is unaffected; single-transaction replay picks one of them arbitrarily.
  if n=1 then  
    select x_id into v_x_id from mq.table_change_rec_grp1_5 where consumed=false order by crt_time,id limit 1;  
    open curs1 for select * from mq.table_change_rec_grp1_5 where consumed=false and x_id=v_x_id order by crt_time,id for update;  
  else  
    select crt_time into v_crt_time from mq.table_change_rec_grp1_5 where consumed=false order by crt_time,id limit 1 offset n-1;  
    if found then
      open curs1 for select * from mq.table_change_rec_grp1_5 where consumed=false and crt_time<=v_crt_time order by crt_time,id for update;  
    else  
      -- n exceeds the remaining change records  
      open curs1 for select * from mq.table_change_rec_grp1_5 where consumed=false order by crt_time,id for update;  
    end if;  
  end if;  

fetch curs1 into v_table_change_rec_grp1;  
LOOP  
if found then  
-- raise notice '%', v_table_change_rec_grp1;  
-- build sql  
-- case tg insert,update,delete,ddl  
-- quote_ident wraps schema, table and column names  
-- quote_nullable wraps values  
-- For a table without a primary key, if duplicate rows exist and a single row was changed via ctid on the source, the destination will diverge (avoid ctid, or require a primary key / non-null unique constraint)  
case v_table_change_rec_grp1.op  
when 'INSERT' then  
-- assemble the column and value lists  
v_cols := '' ;  
v_vals := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_cols := v_cols || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || ',' ;  
  v_vals := v_vals || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
end loop;  
v_cols := rtrim(v_cols, ',') ;  
v_vals := rtrim(v_vals, ',') ;  

-- assemble the SQL  
v_sql := 'insert into '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||'('||v_cols||')'||' values('||v_vals||');' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_5 set consumed=true where current of curs1;  
return next v_sql;  

when 'UPDATE' then  
-- assemble the SET and WHERE clauses  
v_upd_set := '' ;  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_upd_set := v_upd_set || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_set := rtrim(v_upd_set, ',') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'update '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' set '||v_upd_set||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_5 set consumed=true where current of curs1;  
return next v_sql;  

when 'DELETE' then  
-- assemble the WHERE clause  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.old_rec),1) loop  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'delete from '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_5 set consumed=true where current of curs1;  
return next v_sql;  

else  
  -- raise notice 'I do not known how to deal this op: %', v_table_change_rec_grp1.op;  
end case;  

else  
close curs1;  
return next 'END;';  
return;  
end if;  
fetch curs1 into v_table_change_rec_grp1;  
END LOOP;  


when 'table_change_rec_grp1_6' then  
  -- Commit time: the deferred trigger fires each row trigger at transaction end, where mq.get_commit_time() supplies the time, so every row in a transaction carries an identical timestamp.
  -- Replay order matches commit order; the smallest atomic unit is the transaction.
  -- When a single transaction contains multiple SQL statements, their relative order can be recovered from the command id, or from a sequence.
  -- If several transactions commit at the same instant with identical timestamps and each contains multiple rows, their rows may be replayed interleaved. Batch replay merges them into one transaction, so consistency is unaffected; single-transaction replay picks one of them arbitrarily.
  if n=1 then  
    select x_id into v_x_id from mq.table_change_rec_grp1_6 where consumed=false order by crt_time,id limit 1;  
    open curs1 for select * from mq.table_change_rec_grp1_6 where consumed=false and x_id=v_x_id order by crt_time,id for update;  
  else  
    select crt_time into v_crt_time from mq.table_change_rec_grp1_6 where consumed=false order by crt_time,id limit 1 offset n-1;  
    if found then
      open curs1 for select * from mq.table_change_rec_grp1_6 where consumed=false and crt_time<=v_crt_time order by crt_time,id for update;  
    else  
      -- n exceeds the remaining change records  
      open curs1 for select * from mq.table_change_rec_grp1_6 where consumed=false order by crt_time,id for update;  
    end if;  
  end if;  

fetch curs1 into v_table_change_rec_grp1;  
LOOP  
if found then  
-- raise notice '%', v_table_change_rec_grp1;  
-- build sql  
-- case tg insert,update,delete,ddl  
-- quote_ident wraps schema, table and column names  
-- quote_nullable wraps values  
-- For a table without a primary key, if duplicate rows exist and a single row was changed via ctid on the source, the destination will diverge (avoid ctid, or require a primary key / non-null unique constraint)  
case v_table_change_rec_grp1.op  
when 'INSERT' then  
-- assemble the column and value lists  
v_cols := '' ;  
v_vals := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_cols := v_cols || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || ',' ;  
  v_vals := v_vals || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
end loop;  
v_cols := rtrim(v_cols, ',') ;  
v_vals := rtrim(v_vals, ',') ;  

-- assemble the SQL  
v_sql := 'insert into '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||'('||v_cols||')'||' values('||v_vals||');' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_6 set consumed=true where current of curs1;  
return next v_sql;  

when 'UPDATE' then  
-- assemble the SET and WHERE clauses  
v_upd_set := '' ;  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.new_rec),1) loop  
  v_upd_set := v_upd_set || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.new_rec))[i][2]) || ',' ;  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_set := rtrim(v_upd_set, ',') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'update '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' set '||v_upd_set||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_6 set consumed=true where current of curs1;  
return next v_sql;  

when 'DELETE' then  
-- assemble the WHERE clause  
v_upd_del_where := '' ;  
for i in 1..array_length(hstore_to_matrix(v_table_change_rec_grp1.old_rec),1) loop  
  if quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) = 'NULL' then  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || ' is null ' || ' and';  
  else  
    v_upd_del_where := v_upd_del_where || ' ' || quote_ident((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][1]) || '=' || quote_nullable((hstore_to_matrix(v_table_change_rec_grp1.old_rec))[i][2]) || ' and';  
  end if;  
end loop;  

v_upd_del_where := rtrim(v_upd_del_where, 'd') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'n') ;  
v_upd_del_where := rtrim(v_upd_del_where, 'a') ;  

-- assemble the SQL  
v_sql := 'delete from '||quote_ident(v_table_change_rec_grp1.table_schema)||'.'||quote_ident(v_table_change_rec_grp1.table_name)||' where '|| v_upd_del_where ||';' ;  
-- raise notice '%', v_sql;  
update mq.table_change_rec_grp1_6 set consumed=true where current of curs1;  
return next v_sql;  

else  
  -- raise notice 'I do not known how to deal this op: %', v_table_change_rec_grp1.op;  
end case;  

else  
close curs1;  
return next 'END;';  
return;  
end if;  
fetch curs1 into v_table_change_rec_grp1;  
END LOOP;  

else  
  -- raise notice 'no % queue table deal code in this function.', v_tablename;  
  return;  

end case;  

end;  
$$ language plpgsql strict ;

Create the group-2 consumer function (analogous to the group-1 function, with grp1 replaced by grp2)

Create the group-3 consumer function (analogous to the group-1 function, with grp1 replaced by grp3)

Verify the transactional consistency of data fetched from the message queue

pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 10 src

psql src

select mq.build_sql_grp1(1);  -- min(xid)=max(xid): exactly one xid is fetched  
 BEGIN;  
 update public.grp1_tbl1 set id='131056',info='c0d5e77b1a25e9895579d54abf5a1fe1',crt_time='2016-02-10 22:23:25.327334' where  id='131056' and info='65f46b8b12e5cdd35f1d95d51dbe0d96' and crt_time='2016-02-10 22:05:11.067228' ;  
 update public.grp1_tbl2 set id='1487543',info='988bddbd620b6ebae7bb12ff1498be09',tbl1_id='235631',crt_time='2016-02-10 22:23:25.327334' where  id='1487543' and info='4dcdeddd73928541f3f3991a5bb92239' and tbl1_id='235631' and crt_time='2016-02-10 21:36:37.314308' ;  
 update public.grp1_tbl1 set id='131057',info='8a7f1010881b8c6593022d11635219d9',crt_time='2016-02-10 22:23:25.327334' where  id='131057' and info='c799ebdd5cc46fa8e889c5e7541d7d00' and crt_time='2016-02-10 22:04:57.90855' ;  
 insert into public.grp1_tbl2(id,info,tbl1_id,crt_time) values('1487544','c519842ff4a0d6e8e7369e23bddef451','131057','2016-02-10 22:23:25.327334');  
 END;  

select mq.build_sql_grp1(4);  -- min(xid)=max(xid): exactly one xid is fetched  
 BEGIN;  
 update public.grp1_tbl1 set id='548793',info='32e70c3483fe65c4cf178e14e7ff4c28',crt_time='2016-02-10 22:23:25.327503' where  id='548793' and info='b68e03e7219b05e8f0b721772a665da8' and crt_time='2016-02-10 21:36:40.8851' ;  
 update public.grp1_tbl2 set id='665971',info='e2163b17b141901cd755436393b065f6',tbl1_id='328299',crt_time='2016-02-10 22:23:25.327503' where  id='665971' and info='911f8c54164052aa756a2d0ddcff5303' and tbl1_id='328299' and crt_time='2016-02-10 22:04:09.349097' ;  
 update public.grp1_tbl1 set id='548794',info='92c9e6cf5bff1c0f8bb343045a99aca5',crt_time='2016-02-10 22:23:25.327503' where  id='548794' and info='09b25e5bf65703fc8036b9c11fc851c5' and crt_time='2016-02-10 21:36:40.8851' ;  
 update public.grp1_tbl2 set id='665972',info='fd988478cc4a2ef639231cdb8c12bd87',tbl1_id='328300',crt_time='2016-02-10 22:23:25.327503' where  id='665972' and info='9c536cd4a631ab1bf819c2ab28351f4a' and tbl1_id='328300' and crt_time='2016-02-10 22:04:09.349097' ;  
 END;  

select mq.build_sql_grp1(5);  -- min(xid) <> max(xid): the last xid is not fetched  
 BEGIN;  
 update public.grp1_tbl1 set id='89941',info='41da03886622b5617ca640804edec45a',crt_time='2016-02-10 22:23:25.327111' where  id='89941' and info='d2f3972c3a254cc740d48ff10521fc34' and crt_time='2016-02-10 22:10:41.274318' ;  
 update public.grp1_tbl2 set id='930464',info='35d7e72cab05a5b77361b98605ce8a1f',tbl1_id='655832',crt_time='2016-02-10 22:23:25.327111' where  id='930464' and info='ee54ab83230c477bf504cc9db772adbe' and tbl1_id='655832' and crt_time='2016-02-10 21:36:21.791644' ;  
 update public.grp1_tbl1 set id='89942',info='70cdce347ab4c7e4fe9e07548e7031ca',crt_time='2016-02-10 22:23:25.327111' where  id='89942' and info='403cff6b29b47adaa3177051f2503d86' and crt_time='2016-02-10 22:10:41.274318' ;  
 update public.grp1_tbl2 set id='930465',info='4d5a312fe01ae145c6eeea4f3e5af046',tbl1_id='68321',crt_time='2016-02-10 22:23:25.327111' where  id='930465' and info='61d352aa53dbb00b04bc036130daaff2' and tbl1_id='68321' and crt_time='2016-02-10 21:36:21.791644' ;  
 END;  

select mq.build_sql_grp1(13);  -- min(xid) <> max(xid): the last xid is not fetched  
 BEGIN;  
 update public.grp1_tbl1 set id='782912',info='c7909a2201657277731746397d237ef9',crt_time='2016-02-10 22:23:25.327133' where  id='782912' and info='d783a9796860b1316411f5b258051858' and crt_time='2016-02-10 21:24:44.640954' ;  
 update public.grp1_tbl1 set id='379581',info='f72224a3ba4cb557ea49994b16a3138b',crt_time='2016-02-10 22:23:25.327133' where  id='379581' and info='7a7b1817f4aac88af359ba594ddb5924' and crt_time='2016-02-10 21:36:25.240159' ;  
 update public.grp1_tbl2 set id='647936',info='ffb15cdf80106bb3593adf2ee60ccf30',tbl1_id='77106',crt_time='2016-02-10 22:23:25.327133' where  id='647936' and info='8599d26d938651dfcfde78b4f86ffb9d' and tbl1_id='77106' and crt_time='2016-02-10 21:35:39.811541' ;  
 insert into public.grp1_tbl2(id,info,tbl1_id,crt_time) values('1310498','57c717a8be1aaf2f45eaad8826233e83','782912','2016-02-10 22:23:25.327133');  
 insert into public.grp1_tbl1(id,info,crt_time) values('379582','d7ed641cee4485a94a6496a18de86c44','2016-02-10 22:23:25.327133');  
 update public.grp1_tbl1 set id='782913',info='d02f8d22d0f6c24d58ab13df5b3c9fba',crt_time='2016-02-10 22:23:25.327133' where  id='782913' and info='5d6844b7d2dc5ec3143c932eae2ad36e' and crt_time='2016-02-10 21:24:44.640954' ;  
 update public.grp1_tbl2 set id='647937',info='2e43cfcd9abf35d7e35ba94130555f61',tbl1_id='77107',crt_time='2016-02-10 22:23:25.327133' where  id='647937' and info='60f4e42343aa30120afa3033c1905ed2' and tbl1_id='77107' and crt_time='2016-02-10 21:35:39.811541' ;  
 update public.grp1_tbl2 set id='1310499',info='496e0a9ecc1a5d49592005eafdd5e68f',tbl1_id='106818',crt_time='2016-02-10 22:23:25.327133' where  id='1310499' and info='aca3cb5217031c58db8640ec376e66d6' and tbl1_id='106818' and crt_time='2016-02-10 22:05:16.69726' ;  
 update public.grp1_tbl1 set id='411867',info='17e6c6b8251f25e19c8cd234bceb1e06',crt_time='2016-02-10 22:23:25.327571' where  id='411867' and info='99ae15ee891be0d2561d7068d1502570' and crt_time='2016-02-10 22:04:28.532467' ;  
 update public.grp1_tbl2 set id='1358340',info='6975e14b065d06c78166fd84bd413919',tbl1_id='443296',crt_time='2016-02-10 22:23:25.327571' where  id='1358340' and info='a7e98e8f6edc4c98b2a933e9dcfb1ee8' and tbl1_id='443296' and crt_time='2016-02-10 22:04:30.559512' ;  
 update public.grp1_tbl1 set id='411868',info='4945ec0218c9c85406ab1e14dda51588',crt_time='2016-02-10 22:23:25.327571' where  id='411868' and info='aa89bbdd62ee5821976ee3bffd8866ad' and crt_time='2016-02-10 22:10:39.064562' ;  
 update public.grp1_tbl2 set id='1358341',info='4d99962abf04a45a85f6f729ff63cd25',tbl1_id='443297',crt_time='2016-02-10 22:23:25.327571' where  id='1358341' and info='6a29504b044349981c1309842cd8cb8a' and tbl1_id='443297' and crt_time='2016-02-10 22:04:30.559512' ;  
 END;

Connect to the destination database and create the structure of the replicated tables.

\c dest

Group 1: these two tables are linked by a foreign key and are modified within a single transaction, so their change-tracking records must go into the same tracking table.

create table grp1_tbl1 (id int8 primary key, info text, crt_time timestamp);  
create table grp1_tbl2 ( id int8 primary key, tbl1_id int8 REFERENCES grp1_tbl1(id) DEFERRABLE INITIALLY DEFERRED, info text, crt_time timestamp );

Group 2: these two tables are linked by a foreign key and are modified within one transaction, so the change-tracking records for all tables in the transaction must go into the same tracking table.

create table grp2_tbl1 (id int8 primary key, info text, crt_time timestamp);  
create table grp2_tbl2 ( id int8 primary key, tbl1_id int8 REFERENCES grp2_tbl1(id) DEFERRABLE INITIALLY DEFERRED, info text, crt_time timestamp );

Group 3: these two tables are linked by a foreign key and are modified within one transaction, so the change-tracking records for all tables in the transaction must go into the same tracking table.

create table grp3_tbl1 (id int8 primary key, info text, crt_time timestamp);  
create table grp3_tbl2 ( id int8 primary key, tbl1_id int8 REFERENCES grp3_tbl1(id) DEFERRABLE INITIALLY DEFERRED, info text, crt_time timestamp );

Example 1, brute-force synchronization:
Start the stress test

pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 100000 src

With all the triggers in place, the data can now be exported.
While the stress test keeps running, dump the data and restore it into dest

pg_dump -F p -a -t grp1_tbl1 -t grp1_tbl2 -t grp2_tbl1 -t grp2_tbl2 -t grp3_tbl1 -t grp3_tbl2 -x src | psql dest -f -  

COPY 158048  
COPY 164730  
COPY 158068  
COPY 165006  
COPY 158147  
COPY 164808

Keep the stress test running.

Start the incremental replay. First replicate a single transaction at a time to skip over the duplicated part (the consumer function sets consumed=true as soon as a record is fetched, whether or not it is applied successfully on the target)

while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp1(1)) to stdout;commit;' | psql dest -f - >/dev/null ; done  
while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp2(1)) to stdout;commit;' | psql dest -f - >/dev/null ; done  
while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp3(1)) to stdout;commit;' | psql dest -f - >/dev/null ; done

Once the duplicated part has been skipped, switch to batched incremental replication

while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp1(1000)) to stdout;commit;' | psql dest -f - >/dev/null ; done  
while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp2(1000)) to stdout;commit;' | psql dest -f - >/dev/null ; done  
while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp3(1000)) to stdout;commit;' | psql dest -f - >/dev/null ; done

Stop the stress test and wait for the incremental sync to finish.

Verify the data

psql src  
select sum(hashtext(t.*::text)) from grp1_tbl1 t;  
-163788004315  
select sum(hashtext(t.*::text)) from grp1_tbl2 t;  
311855736266  
select sum(hashtext(t.*::text)) from grp2_tbl1 t;  
-1605268316207  
select sum(hashtext(t.*::text)) from grp2_tbl2 t;  
-136992258088  
select sum(hashtext(t.*::text)) from grp3_tbl1 t;  
2375761278075  
select sum(hashtext(t.*::text)) from grp3_tbl2 t;  
-388257824197  

psql dest  
select sum(hashtext(t.*::text)) from grp1_tbl1 t;  
select sum(hashtext(t.*::text)) from grp1_tbl2 t;  
select sum(hashtext(t.*::text)) from grp2_tbl1 t;  
select sum(hashtext(t.*::text)) from grp2_tbl2 t;  
select sum(hashtext(t.*::text)) from grp3_tbl1 t;  
select sum(hashtext(t.*::text)) from grp3_tbl2 t;

The results match.
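The comparison can be scripted; a minimal bash sketch, assuming the same psql connection defaults used above:

for t in grp1_tbl1 grp1_tbl2 grp2_tbl1 grp2_tbl2 grp3_tbl1 grp3_tbl2; do
  s=$(psql src  -A -t -c "select sum(hashtext(t.*::text)) from $t t")
  d=$(psql dest -A -t -c "select sum(hashtext(t.*::text)) from $t t")
  [ "$s" = "$d" ] && echo "$t ok" || echo "$t MISMATCH: $s vs $d"   # any mismatch means the sync is incomplete
done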

Example 2, a gentler approach that skips the duplicates by exporting from a snapshot:
(Clear the data on the dest side first.)

Start the stress test

pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 100000 src

Connect to the source database, open a snapshot, and record the current transaction state. Do not end the transaction yet.

psql src  
src=# begin transaction isolation level repeatable read ;  
BEGIN  
src=# select txid_current_snapshot();  
               txid_current_snapshot      
----------------------------------------------------------------------  
 31004443:31004517:31004443,31004446,31004449,31004457,31004466,31004469,31004480,31004487,31004489,31004493,31004495,31004498,31004500,31004501,31004502,31004503,31004505,31004507,31004508,31004509,31004510,31004511,31004512,31004513,31004514,31004515  
(1 row)

Format: smallest uncommitted xid : smallest unassigned xid : uncommitted xid(s)

src=# select pg_export_snapshot();  
 pg_export_snapshot   
--------------------  
 01DA7E30-1  
(1 row)

Use this snapshot to dump the data and restore it into dest

pg_dump --snapshot=01DA7E30-1 -F p -a -t grp1_tbl1 -t grp1_tbl2 -t grp2_tbl1 -t grp2_tbl2 -t grp3_tbl1 -t grp3_tbl2 -x src | psql dest -f -  

COPY 678854  
COPY 865425  
COPY 679293  
COPY 866652  
COPY 678734  
COPY 865728

End the snapshot transaction

src=# end;  
COMMIT

Keep the stress test running.

Start the incremental replay. First, be sure to wait until the transactions that were uncommitted at pg_dump time have committed.

postgres=# select * from txid_snapshot_xip(txid_current_snapshot()) t(xid) where t.xid in (31004443,31004446,31004449,31004457,31004466,31004469,31004480,31004487,31004489,31004493,31004495,31004498,31004500,31004501,31004502,31004503,31004505,31004507,31004508,31004509,31004510,31004511,31004512,31004513,31004514,31004515);  
 xid   
-----  
(0 rows)
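This wait can be automated by polling until none of those xids are still in progress; a sketch (the xid list is truncated here, fill in the full in-progress list recorded from txid_current_snapshot above):

xids='31004443,31004446,31004449'   # the full in-progress xid list recorded earlier
while [ -n "$(psql src -A -t -c "select 1 from txid_snapshot_xip(txid_current_snapshot()) t(xid) where t.xid in ($xids) limit 1")" ]; do
  sleep 1   # at least one xid from the dump snapshot is still running
done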

Mark the tracking records that do not need to be replayed as consumed

psql src  
src=# update mq.table_change_rec_grp1 set consumed =true where consumed=false and (x_id<31004443 or (x_id>=31004443 and x_id<31004517 and x_id not in (31004443,31004446,31004449,31004457,31004466,31004469,31004480,31004487,31004489,31004493,31004495,31004498,31004500,31004501,31004502,31004503,31004505,31004507,31004508,31004509,31004510,31004511,31004512,31004513,31004514,31004515)));  
UPDATE 699488  
src=# update mq.table_change_rec_grp2 set consumed =true where consumed=false and (x_id<31004443 or (x_id>=31004443 and x_id<31004517 and x_id not in (31004443,31004446,31004449,31004457,31004466,31004469,31004480,31004487,31004489,31004493,31004495,31004498,31004500,31004501,31004502,31004503,31004505,31004507,31004508,31004509,31004510,31004511,31004512,31004513,31004514,31004515)));  
UPDATE 699404  
src=# update mq.table_change_rec_grp3 set consumed =true where consumed=false and (x_id<31004443 or (x_id>=31004443 and x_id<31004517 and x_id not in (31004443,31004446,31004449,31004457,31004466,31004469,31004480,31004487,31004489,31004493,31004495,31004498,31004500,31004501,31004502,31004503,31004505,31004507,31004508,31004509,31004510,31004511,31004512,31004513,31004514,31004515)));  
UPDATE 699328

Batched incremental replication

while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp1(1000)) to stdout;commit;' | psql dest -f - >/dev/null ; done  
while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp2(1000)) to stdout;commit;' | psql dest -f - >/dev/null ; done  
while true; do psql src -q -A -n -t -c 'begin work isolation level repeatable read; copy (select mq.build_sql_grp3(1000)) to stdout;commit;' | psql dest -f - >/dev/null ; done

Stop the stress test and wait for the incremental sync to finish.

Verify the data

psql src  
select sum(hashtext(t.*::text)) from grp1_tbl1 t;  
566782435274  
select sum(hashtext(t.*::text)) from grp1_tbl2 t;  
119298584431  
select sum(hashtext(t.*::text)) from grp2_tbl1 t;  
-794442717174  
select sum(hashtext(t.*::text)) from grp2_tbl2 t;  
-390984534106  
select sum(hashtext(t.*::text)) from grp3_tbl1 t;  
2937942086023  
select sum(hashtext(t.*::text)) from grp3_tbl2 t;  
302638200204  

psql dest  
select sum(hashtext(t.*::text)) from grp1_tbl1 t;  
select sum(hashtext(t.*::text)) from grp1_tbl2 t;  
select sum(hashtext(t.*::text)) from grp2_tbl1 t;  
select sum(hashtext(t.*::text)) from grp2_tbl2 t;  
select sum(hashtext(t.*::text)) from grp3_tbl1 t;  
select sum(hashtext(t.*::text)) from grp3_tbl2 t;

The results match.

Greenplum: accessing external data sources with gplink (a Java transform)

Source code:
The examples include Oracle and SQL Server data sources (a query, e.g. select * from table;).

How it works:
Greenplum supports gpfdist-protocol external tables, and the gpfdist protocol supports custom transforms.
gplink connects to the external data source over JDBC, defines a transform, and converts the JDBC data to text format for loading into GP or HAWQ.

Usage
README file for GPLink
########################################################################################
Site: http://www.PivotalGuru.com
Author: Jon Roberts
Email: jgronline@gmail.com
########################################################################################
GPLink links JDBC connections to Greenplum and Hawq External Tables.

Data is automatically cleansed for embedded carriage returns, newline, and/or null
characters.  Escape characters are retained by double escaping and embedded pipes
are retained by escaping. 

########################################################################################
#Installation:
########################################################################################
- gplink must be installed on a server that is accessible by all nodes of Greenplum
or Hawq.  A dedicated ETL server or the standby master are good candidates for 
hosting gplink.

1.  Download latest version from PivotalGuru.com
2.  Unzip <version>.zip
3.  source gplink_path.sh and add this to your .bashrc file
4.  Edit gplink.properties with correct Greenplum or Hawq connection information
5.  Download 3rd party JDBC drivers and place them in $GPLINK_HOME/jar
6.  Define source configurations in $GPLINK_HOME/connections/ 
7.  Define external table names and columns in $GPLINK_HOME/tables/
8.  Define SQL statements to execute in the source in $GPLINK_HOME/sql/
9.  Create the External Table with gpltable

########################################################################################
#Creating External Tables
########################################################################################
gpltable -s <source_config> -t <target_config> -f <sql> -a <source_table>
example:
gpltable -s sqlserver.properties -t $GPLINK_HOME/gplink.properties -f example.sql -a $GPLINK_HOME/tables/public.test.sql

########################################################################################
#Dropping External Tables
########################################################################################
gpldrop -t <target_config> -n <table_name>
example:
gpldrop -t $GPLINK_HOME/gplink.properties -n public.test

########################################################################################
#Start the gpfdist processes
########################################################################################
gplstart -t <target_config>
example:
gplstart -t $GPLINK_HOME/gplink.properties

Note: this is useful when the host is restarted and you need to start all of the gpfdist
processes needed by gplink External Tables.

########################################################################################
#Debugging
########################################################################################
export GPLINK_DEBUG=true

Turn off debugging:
export GPLINK_DEBUG=

Note: this will show all debug messages from gplstart, gpltable, and gpldrop.



cgroup terminology and rules

cgroup is the Linux mechanism for isolating and managing resource usage.
Red Hat has a fairly detailed introduction to it.

First, a few terms that help in understanding how cgroups work.
1. Subsystems: resource categories, e.g. cpu, memory, cpuset, blkio. If the cgconfig service is installed, the details can be seen in the configuration file /etc/cgconfig.conf.

#
#  Copyright IBM Corporation. 2007
#
#  Authors:     Balbir Singh <balbir@linux.vnet.ibm.com>
#  This program is free software; you can redistribute it and/or modify it
#  under the terms of version 2.1 of the GNU Lesser General Public License
#  as published by the Free Software Foundation.
#
#  This program is distributed in the hope that it would be useful, but
#  WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See man cgconfig.conf for further details.
#
# By default, mount all controllers to /cgroup/<controller>

mount {
        cpuset  = /cgroup/cpuset;
        cpu     = /cgroup/cpu;
        cpuacct = /cgroup/cpuacct;
        memory  = /cgroup/memory;
        devices = /cgroup/devices;
        freezer = /cgroup/freezer;
        net_cls = /cgroup/net_cls;
        blkio   = /cgroup/blkio;
}


2. Hierarchies: the top logical level of resource management. Hierarchies are attached via mount, and multiple hierarchies can be mounted on one system. Initially, every process pid belongs to this level.

3. Control Groups: the level below a hierarchy. A cgroup may contain further cgroups, but such nesting does not express a hierarchy of limits; it is only a path. Within one hierarchy all cgroups are peers, no matter where a cgroup sits in the path (for /cgroup/cpu/cg1, /cgroup/cpu/cg2, and /cgroup/cpu/cg1/cg3, the groups cg1, cg2, and cg3 are peers).

4. Tasks: process pids.

Rules
The following rules must be followed when managing resources with cgroups:
Rule 1
A single hierarchy can contain multiple subsystems, for example
mkdir /cgroup/cpu_mem_cg
mount -t cgroup -o cpu,memory cpu_mem_cg /cgroup/cpu_mem_cg
mkdir /cgroup/cpu_mem_cg/cg1
mkdir /cgroup/cpu_mem_cg/cg2
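Limits are then set and tasks assigned by writing to the control files under each cgroup; a minimal sketch against the hierarchy just created (the values and the pid are arbitrary examples):

echo 512 > /cgroup/cpu_mem_cg/cg1/cpu.shares                    # relative CPU weight
echo 1073741824 > /cgroup/cpu_mem_cg/cg1/memory.limit_in_bytes  # cap memory at 1 GiB
echo 1234 > /cgroup/cpu_mem_cg/cg1/tasks                        # move pid 1234 into cg1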

Rule 2
A subsystem can belong to only one hierarchy (with one exception: a subsystem may belong to multiple hierarchies whose subsystems are exactly identical).
For example
mkdir /cgroup/cpu_cg
mkdir /cgroup/cpu_mem_cg
mount -t cgroup -o cpu cpu_cg /cgroup/cpu_cg
mount -t cgroup -o memory cpu_mem_cg /cgroup/cpu_mem_cg
Running the following command will then fail, because the cpu_mem_cg hierarchy already contains the memory subsystem, so the cpu subsystem cannot be mounted under both hierarchies at the same time.
mount -t cgroup -o cpu cpu_mem_cg /cgroup/cpu_mem_cg

There is one case where a subsystem may be mounted under multiple hierarchies: when those hierarchies' subsystems are completely identical. For example
#mkdir /cgroup/cg1
#mkdir /cgroup/cg2
#mount -t cgroup -o cpu,memory cg1 /cgroup/cg1
#mount -t cgroup -o cpu,memory cg2 /cgroup/cg2
Doing this serves little purpose, because the same PID appears in both hierarchies. Creating a cgroup in either hierarchy automatically creates the corresponding cgroup in the other, and when a task is moved into a cgroup, the PID is automatically moved into the corresponding cgroup of the other hierarchy; the two hierarchies remain fully identical.


Rule 3
Within a single hierarchy, a process cannot belong to more than one cgroup of that hierarchy.
It can, however, belong to cgroups in multiple hierarchies.

Rule 4
A child process created by fork is automatically placed in its parent's cgroup.

Finally, a question to think about:
can the IOPS of each individual process in a group and the total IOPS of the whole group be controlled at the same time?
For example
pid a
  iops <=3000
pid b
  iops <=3000
pid c
  iops <=3000
pid d
  iops <=3000
while also requiring
pid a+b+c+d
  iops <= 8000

The current cgroup version cannot satisfy this requirement.
Another example: a group of 10 PIDs, each limited to IOPS=100, with the added requirement that the 10 processes together never exceed IOPS=500.
This cannot be met today, because cgroups within one hierarchy are peers.
The following does not work:
(different cgroups within the same hierarchy belong to the same level)
/cgroup/blkio/blkio.throttle.write_iops_device
    "8:16 5000"
/cgroup/blkio/tasks
    pids other than a, b

/cgroup/blkio/cg1/blkio.throttle.write_iops_device
    "8:16 3000"
/cgroup/blkio/cg1/tasks
    pid a
/cgroup/blkio/cg2/blkio.throttle.write_iops_device
    "8:16 3000"
/cgroup/blkio/cg2/tasks
    pid b

The following does not work either:
(it uses different hierarchies, but a subsystem can only belong to one hierarchy, or to hierarchies whose subsystems are completely identical)
/cgroup/blkio1/blkio.throttle.write_iops_device
    "8:16 3000"
/cgroup/blkio1/tasks
    pid a
/cgroup/blkio2/blkio.throttle.write_iops_device
    "8:16 3000"
/cgroup/blkio1/tasks
    pid b
/cgroup/blkio_all/blkio.throttle.write_iops_device
    "8:16 5000"
/cgroup/blkio1/tasks
    pid a,b

gp_qd_proc_offset overflow when initializing Greenplum with 2GB or more of shared memory

When initializing a GP cluster, specifying shared memory of 2GB or more makes GP set gp_qd_proc_offset to a value that overflows INT, so the database fails to start.
For example
gpinitsystem -c ./gpinitsystem_config --locale=C --max_connections=48 --shared_buffers=2GB --su_password=digoal
......
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-Process results...
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-----------------------------------------------------
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-   Successful segment starts                                            = 8
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-   Failed segment starts                                                = 0
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-   Skipped segment starts (segments are marked down in configuration)   = 0
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-----------------------------------------------------
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-Successfully started 8 of 8 segment instances 
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-----------------------------------------------------
20160215:08:11:19:083324 gpstart:digoal:digoal-[INFO]:-Starting Master instance digoal.sqa.zmf directory /disk1/digoal/gpdata/gpseg-1 
20160215:08:11:23:083324 gpstart:digoal:digoal-[INFO]:-Command pg_ctl reports Master digoal.sqa.zmf instance active
20160215:08:11:23:083324 gpstart:digoal:digoal-[WARNING]:-FATAL:  DTM initialization: failure during startup recovery, retry failed, check segment status (cdbtm.c:1603)

20160215:08:11:23:083324 gpstart:digoal:digoal-[INFO]:-No standby master configured.  skipping...
20160215:08:11:23:083324 gpstart:digoal:digoal-[INFO]:-Check status of database with gpstate utility
20160215:08:11:25:048361 gpinitsystem:digoal:digoal-[INFO]:-Completed restart of Greenplum instance in production mode
20160215:08:11:25:gpinitsystem:digoal:digoal-[FATAL]:-Failed to complete create database digoal  Script Exiting!
......
Database logs
master
2016-02-15 08:13:47.275627 CST,"digoal","template1",p85680,th1235572672,"127.0.0.1","50183",2016-02-15 08:13:47 CST,1072,con5,,seg-1,,,x1072,sx1,"LOG","00000","Failed connection to seg0 digoal:40000",,,,,,,0,,"cdbgang.c",430
,
2016-02-15 08:13:47.275667 CST,"digoal","template1",p85680,th1235572672,"127.0.0.1","50183",2016-02-15 08:13:47 CST,1072,con5,,seg-1,,,x1072,sx1,"LOG","58M01","Master unable to connect to seg0 digoal:40000 with options : FATAL:  invalid value for parameter ""gp_qd_proc_offset"": ""2211046976""
HINT:  Value exceeds integer range.",,,,,,,0,,"cdbgang.c",437,

segment
2016-02-15 08:13:38.219721 CST,,,p85603,th619771840,"127.0.0.1","39921",2016-02-15 08:13:38 CST,0,,,seg-1,,,,,"LOG","00000","received transition request packet. processing the request",,,,,,,0,,"postmaster.c",2698,
2016-02-15 08:13:47.235698 CST,"digoal","template1",p85691,th619771840,"xxx.xxx.xxx.xxx","45197",2016-02-15 08:13:47 CST,985,con5,,seg0,,,x985,sx1,"FATAL","22023","invalid value for parameter ""gp_qd_proc_offset"": ""2211046976""",,"Value exceeds integer range.",,,,,0,,"guc.c",4627,

The fix is simply to reset the database parameters.
For example:
master
listen_addresses='0.0.0.0'
port=1921
max_connections = 48
shared_buffers = 512MB
max_prepared_transactions = 250
max_fsm_pages = 41943040
max_fsm_relations = 2621439
optimizer_analyze_root_partition = on
datestyle = 'iso, mdy'
lc_messages = 'C'
lc_monetary = 'C'
lc_numeric = 'C'
lc_time = 'C'
gp_resqueue_memory_policy = 'eager_free'
max_appendonly_tables = 10000
gp_interconnect_type=udpifc
gp_connections_per_thread = 64
gp_segment_connect_timeout = 600s
gp_vmem_protect_limit = 7000    
statement_mem = 6000
default_statistics_target = 100
gp_backup_directIO = on
gp_backup_directIO_read_chunk_mb = 20
log_statement=all
checkpoint_segments=512
gp_set_read_only=off
gp_workfile_limit_per_segment=40GB     
gp_statistics_use_fkeys=on
gp_vmem_protect_segworker_cache_limit=128
superuser_reserved_connections=6
tcp_keepalives_count=6
tcp_keepalives_idle=60
tcp_keepalives_interval=15
unix_socket_directory='.'
effective_cache_size=8GB
log_error_verbosity=verbose
log_connections=on
log_disconnections=on
log_autostats=on
stats_queue_level=on
extra_float_digits=2
gp_default_storage_options='appendonly=true, orientation=column'
gp_fts_probe_threadcount=48
gp_log_fts=verbose
wal_receiver_status_interval=1s
filerep_mirrorvalidation_during_resync=true

segment
listen_addresses='0.0.0.0'
port=40000     # ......                 
max_connections = 144
shared_buffers = 2048MB
max_prepared_transactions = 250
max_fsm_pages = 10485760
max_fsm_relations = 655359
optimizer_analyze_root_partition = on
datestyle = 'iso, mdy'
lc_messages = 'C'
lc_monetary = 'C'
lc_numeric = 'C'
lc_time = 'C'
gp_resqueue_memory_policy = 'eager_free'
max_appendonly_tables = 10000
gp_interconnect_type=udpifc
gp_connections_per_thread = 64
gp_segment_connect_timeout = 600s
gp_vmem_protect_limit = 7000 
statement_mem = 6000
default_statistics_target = 100
gp_backup_directIO = on
gp_backup_directIO_read_chunk_mb = 20
checkpoint_segments=512
gp_set_read_only=off
gp_workfile_limit_per_segment=40GB    
gp_statistics_use_fkeys=on
gp_vmem_protect_segworker_cache_limit=128
gp_resqueue_priority_cpucores_per_segment=1.5

Shut down the database with kill -SIGINT ....
Then restart the database.

Therefore, initialize the cluster with a small shared memory setting and change shared_buffers later by modifying the parameter.
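A sketch of that later change with gpconfig (cluster restart required; the values are examples):

gpconfig -c shared_buffers -v 2GB -m 512MB   # -v sets the segment value, -m the master value
gpstop -r                                    # restart the cluster to apply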

A companion for AliCloudDB PgSQL - building a PostgreSQL logical standby

This approach suits logical incremental synchronization between PostgreSQL databases.
The only PostgreSQL version requirement is 8.3 or later.
The smallest unit of synchronization is the row. Users choose which tables to synchronize, and tables can be grouped (tables related by transactions form one group).
Different groups can be subscribed to and consumed in parallel.
As shown in the figure:
tables A, B, and C are transactionally related and go into one message queue;
tables D, E, and F are transactionally related and go into another message queue.
Changes to the tables are tracked inside the database and recorded into the database's MQ.
Subscribers translate the MQ into SQL and replay it on the target in transaction commit order, achieving logical replication.
The principle is the same as MySQL binlog replication.

Example use cases:
1. Database synchronization between multiple IDCs.
2. Data synchronization between PostgreSQL in the public cloud and databases self-built on ECS or in the user's own IDC.

Measured performance:
a single group sustains more than 8,000 SQL statements per second;
with parallel execution, more than 20,000 SQL statements per second.

An example including code:

Vitesse, an LLVM build of PostgreSQL: OLTP+OLAP at the 100TB scale

Vitesse is an enhanced distribution of community PostgreSQL, said to support OLTP and OLAP workloads at the 100TB scale.
Installation on CentOS 6.x x64:

wget http://storage.googleapis.com/vitessedata/download/vitessedb.9.3.11.E.rh6.x86_64.32k.160218.bin
./vitessedb.9.3.11.E.rh6.x86_64.32k.160218.bin


Let's look at how Vitesse's build options differ:
pg_config 

BINDIR = /home/digoal/vitessedb.9.3.11.E.160214/bin
DOCDIR = /home/digoal/vitessedb.9.3.11.E.160214/share/doc/postgresql
HTMLDIR = /home/digoal/vitessedb.9.3.11.E.160214/share/doc/postgresql
INCLUDEDIR = /home/digoal/vitessedb.9.3.11.E.160214/include
PKGINCLUDEDIR = /home/digoal/vitessedb.9.3.11.E.160214/include/postgresql
INCLUDEDIR-SERVER = /home/digoal/vitessedb.9.3.11.E.160214/include/postgresql/server
LIBDIR = /home/digoal/vitessedb.9.3.11.E.160214/lib
PKGLIBDIR = /home/digoal/vitessedb.9.3.11.E.160214/lib/postgresql
LOCALEDIR = /home/digoal/vitessedb.9.3.11.E.160214/share/locale
MANDIR = /home/digoal/vitessedb.9.3.11.E.160214/share/man
SHAREDIR = /home/digoal/vitessedb.9.3.11.E.160214/share/postgresql
SYSCONFDIR = /home/digoal/vitessedb.9.3.11.E.160214/etc/postgresql
PGXS = /home/digoal/vitessedb.9.3.11.E.160214/lib/postgresql/pgxs/src/makefiles/pgxs.mk
CONFIGURE = '--prefix=/opt/vitessedb.9.3.11.E' '--with-blocksize=32' '--enable-debug'
CC = gcc
CPPFLAGS = -D_GNU_SOURCE
CFLAGS = -O2 -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -fexcess-precision=standard -g
CFLAGS_SL = -fpic
LDFLAGS = -L../../../src/common -Wl,--as-needed -Wl,-rpath,'/opt/vitessedb.9.3.11.E/lib',--enable-new-dtags
LDFLAGS_EX = 
LDFLAGS_SL = 
LIBS = -lpgport -lpgcommon -lz -lreadline -lcrypt -ldl -lm 
VERSION = PostgreSQL 9.3.11


less $PGHOME/lib/postgresql/pgxs/src/Makefile.global

# Saved arguments from configure
configure_args =  '--prefix=/opt/vitessedb.9.3.11.E' '--with-blocksize=32' '--enable-debug'

CC = gcc
GCC = yes
SUN_STUDIO_CC = no
CFLAGS = -O2 -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -fexcess-precision=standard -g
CFLAGS_VECTOR =  -funroll-loops -ftree-vectorize

# Linking
AR = ar
DLLTOOL = 
DLLWRAP = 
LIBS = -lz -lreadline -lcrypt -ldl -lm 
LDAP_LIBS_FE = 
LDAP_LIBS_BE = 
OSSP_UUID_LIBS = 
LD = /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-redhat-linux/4.8.2/ld
with_gnu_ld = yes
ld_R_works = 


Testing:
Configure the environment variables
vi env_vpg.sh 

export PS1="$USER@`/bin/hostname -s`-> "
export PGPORT=1933
export PGDATA=/disk1/digoal/pgdata/pg_root_vdb
export LANG=en_US.utf8
export PGHOME=/home/digoal/vitessedb
export LD_LIBRARY_PATH=$PGHOME/lib:$PGHOME/lib/postgresql:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGHOST=$PGDATA
export PGUSER=postgres
export PGDATABASE=postgres
alias rm='rm -i'
alias ll='ls -lh'
unalias vi


. env_vpg.sh

Initialize the database
initdb -D $PGDATA -U postgres -E UTF8 --locale=C -W

Configure the database parameters

listen_addresses = '0.0.0.0'            # what IP address(es) to listen on;
port = 1933                             # (change requires restart)
max_connections = 100                   # (change requires restart)
unix_socket_directories = '.'   # comma-separated list of directories
tcp_keepalives_idle = 10                # TCP_KEEPIDLE, in seconds;
tcp_keepalives_interval = 60            # TCP_KEEPINTVL, in seconds;
tcp_keepalives_count = 10               # TCP_KEEPCNT;
shared_buffers = 32GB                   # min 128kB
work_mem = 32MB                         # min 64kB
maintenance_work_mem = 512MB            # min 1MB
bgwriter_delay = 10ms                   # 10-10000ms between rounds
bgwriter_lru_maxpages = 1000            # 0-1000 max buffers written/round
effective_io_concurrency = 5            # 1-1000; 0 disables prefetching
synchronous_commit = off                # synchronization level;
full_page_writes = off                  # recover from partial page writes
wal_buffers = 16MB                      # min 32kB, -1 sets based on shared_buffers
wal_writer_delay = 10ms         # 1-10000 milliseconds
checkpoint_segments = 2048              # in logfile segments, min 1, 16MB each
checkpoint_timeout = 55min              # range 30s-1h
checkpoint_completion_target = 0.9      # checkpoint target duration, 0.0 - 1.0
log_destination = 'csvlog'              # Valid values are combinations of
logging_collector = on          # Enable capturing of stderr and csvlog
log_checkpoints = on
log_connections = on
log_disconnections = on
log_error_verbosity = verbose           # terse, default, or verbose messages
log_timezone = 'PRC'
log_autovacuum_min_duration = 0 # -1 disables, 0 logs all actions and
datestyle = 'iso, mdy'
timezone = 'PRC'
lc_messages = 'C'                       # locale for system error message
lc_monetary = 'C'                       # locale for monetary formatting
lc_numeric = 'C'                        # locale for number formatting
lc_time = 'C'                           # locale for time formatting
default_text_search_config = 'pg_catalog.english'
local_preload_libraries = '' 


Vitesse provides the following parameters:

 vitesse.enable       | on                                                                                  | Enable use of vitesse engine.
 vitesse.last         | 0                                                                                   | Show if vitesse engine was used in the last query.
 vitesse.license      | 20150501|568805|Vitesse DB|64|30-DAY LIMITED TRIAL|8ff5f56b69a4a9f50bd346768e673dfd | Show license info on vitesse engine.
 vitesse.log_level    | 0                                                                                   | Log vitesse engine messages: 1 for ERROR, 2 for NOTICE, 3 for INFO, 4 for VERBOSE.
 vitesse.rev          | 8eb0151                                                                             | Show revision info on vitesse engine.
 vitesse.support_dump |                                                                                     | Dump information for customer support.
 vitesse.thread       | 0                                                                                   | Number of thread used in current session.
 vitesse.threshold    | 200                                                                                 | Engage vitesse engine above this plan cost threshold.
 vitesse.version      | Vitesse DB 9.3.11 [Enterprise Edition, rev 8eb0151 on 2016-02-14]                   | Show version info on vitesse engine.

When comparing the performance of Vitesse against stock PostgreSQL, or checking whether their results agree, use vitesse.enable to switch between them.

The download page offers a simple test:

\timing
postgres=# set vitesse.enable = 1;
postgres=# create table t as
    select generate_series(1,100000000)::bigint as i;
SELECT 100000000
Time: 50252.389 ms
postgres=# select count(*), sum(i*i), avg(i) from t;
   count   |        sum         |          avg          
-----------+--------------------+-----------------------
 100000000 | 672921401752298880 | 50000000.500000000000
(1 row)
Time: 385.597 ms
postgres=# select * from t where i = 10 or i = 20 or i = 30;
 i  
----
 10
 20
 30
(3 rows)
Time: 156.411 ms

postgres=# set vitesse.enable = 0;
SET
Time: 0.210 ms
postgres=# select count(*), sum(i*i), avg(i) from t;
   count   |           sum            |          avg          
-----------+--------------------------+-----------------------
 100000000 | 333333338333333350000000 | 50000000.500000000000
(1 row)
Time: 112403.852 ms
postgres=# select * from t where i = 10 or i = 20 or i = 30;
 i  
----
 10
 20
 30
(3 rows)
Time: 17367.400 ms

Querying set-returning functions directly is, according to Vitesse, processed by the stock PostgreSQL code, so the result is correct.
postgres=# set vitesse.enable =1;
SET
Time: 0.158 ms
postgres=# select sum(i*i) from generate_series(1,10000000::int8) t(i);
sum
-----------------------
333333383333335000000
(1 row)

Time: 6216.040 ms
postgres=# select sum(i*i) from generate_series(1,100000000::int8) t(i);
sum
--------------------------
333333338333333350000000
(1 row)

Time: 63096.960 ms

That sum(i*i) returned different results for vitesse.enable=0 and vitesse.enable=1 was a bug. It was fixed immediately after an email to support; thanks to Mr. Tian and CK.
Retesting after downloading the new build:

postgres=# \timing
Timing is on.
Vitesse code optimization enabled:
postgres=# show vitesse.enable ;
 vitesse.enable 
----------------
 on
(1 row)
Time: 0.092 ms
postgres=# select sum(i*i) from generate_series(1,100000000::int8) t(i);
           sum            
--------------------------
 333333338333333350000000
(1 row)
Time: 62160.868 ms
postgres=# create table t as
postgres-#     select generate_series(1,100000000)::bigint as i;
SELECT 100000000
Time: 46557.467 ms
postgres=# select count(*), sum(i*i), avg(i) from t;
   count   |           sum            |          avg          
-----------+--------------------------+-----------------------
 100000000 | 333333338333333350000000 | 50000000.500000000000
(1 row)
Time: 368.098 ms
postgres=# select * from t where i = 10 or i = 20 or i = 30;
 i  
----
 10
 20
 30
(3 rows)
Time: 146.438 ms
Vitesse code optimization disabled:
postgres=# set vitesse.enable =0;
SET
Time: 0.146 ms
postgres=# select count(*), sum(i*i), avg(i) from t;
   count   |           sum            |          avg          
-----------+--------------------------+-----------------------
 100000000 | 333333338333333350000000 | 50000000.500000000000
(1 row)
Time: 109394.037 ms
postgres=# select * from t where i = 10 or i = 20 or i = 30;
 i  
----
 10
 20
 30
(3 rows)
Time: 18263.442 ms

With Vitesse code optimization enabled, the performance gain is very pronounced.
Below, a perf comparison.
Without Vitesse optimization:

1412.00  9.6% heapgettup_pagemode                    /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
1325.00  9.0% ExecMakeFunctionResultNoSets           /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 922.00  6.2% heapgetpage                            /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 658.00  4.5% slot_getattr                           /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 654.00  4.4% slot_deform_tuple                      /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 590.00  4.0% ExecQual                               /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 573.00  3.9% HeapTupleSatisfiesMVCC                 /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 406.00  2.7% ExecEvalScalarVarFast                  /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 371.00  2.5% ExecScan                               /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 305.00  2.1% ExecStoreTuple                         /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 233.00  1.6% XidInMVCCSnapshot                      /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 230.00  1.6% SeqNext                                /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 178.00  1.2% heap_getnext                           /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 172.00  1.2% MemoryContextReset                     /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 148.00  1.0% check_stack_depth                      /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 147.00  1.0% pgstat_init_function_usage             /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 124.00  0.8% pgstat_end_function_usage              /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 117.00  0.8% TransactionIdPrecedes                  /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 114.00  0.8% ExecEvalConst                          /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 113.00  0.8% _bt_compare                            /home/dege.zzz/vitessedb.9.3.11.E.160214/bin/postgres 
  98.00  0.7% hash_search_with_hash_value            /home/dege.zzz/vitessedb.9.3.11.E.160214/bin/postgres 
  86.00  0.6% int84lt                                /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres

With Vitesse optimization:

1435.00  6.1% parallelscan_task_fn(void*)                    /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 743.00  3.1% heapgetpage                                    /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 743.00  3.1% HeapTupleSatisfiesMVCC                         /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 213.00  0.9% XidInMVCCSnapshot                              /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres 
 123.00  0.5% _bt_compare                                    /home/dege.zzz/vitessedb.9.3.11.E.160214/bin/postgres 
  95.00  0.4% hash_search_with_hash_value                    /home/dege.zzz/vitessedb.9.3.11.E.160214/bin/postgres 
  94.00  0.4% TransactionIdPrecedes                          /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres
  66.00  0.3% CheckForSerializableConflictOut                /home/dege.zzz/vitessedb.9.3.11.E.160218/bin/postgres


The docs page shows that Vitesse uses its own code only in certain cases and falls back to the original PostgreSQL code everywhere else.

DDL
By design, other than CREATE TABLE AS SELECT, all DDL are handled by PostgreSQL.
That is, only CREATE TABLE AS SELECT is handled by Vitesse; all other DDL runs the PostgreSQL code.

DML
By design, other than SELECT and INSERT, all DML clauses are handled by PostgreSQL.
That is, only SELECT and INSERT are handled by Vitesse; all other DML runs the PostgreSQL code.
For SELECT, the following specific cases are NOT supported by Vitesse Engine (but will be handled by PostgreSQL):

That is, the following also run the original PostgreSQL code:
CTE with recursion, although CTE in general are supported.
lead and lag window functions, although other window functions are supported.
Hash, GiST and GIN index, although B-tree is supported.
Functions returning sets.
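Per the vitesse.last parameter listed earlier, it is possible to check after the fact which engine ran a statement; a sketch (run in a single session, since the value is per-session; the expected output is an assumption based on the parameter's description):

psql <<'SQL'
select count(*), sum(i*i), avg(i) from t;
show vitesse.last;  -- presumably nonzero when the Vitesse engine handled the query above
SQL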


Finally, a TPC-H test on 1.2 TB of data.
The following tuning was applied for Vitesse:
1. Use float8 and money instead of numeric, and varchar instead of char, for better performance.
2. Use column store and column compression.
3. The modified TPC-H create .sql is at the end of this article.

TPC-H results stress-testing the column-store engine directly:
2016-02-20 02:52:06 [1455907926] :   running query 1
2016-02-20 02:56:51 [1455908211] :     query 1 finished OK (275 seconds)
2016-02-20 02:56:52 [1455908212] :   running query 2
2016-02-20 05:40:54 [1455918054] :     query 2 finished OK (9812 seconds)  (ERROR:  Vitesse DB cannot pwrite to file)
2016-02-20 05:40:54 [1455918054] :   running query 3
2016-02-20 05:50:52 [1455918652] :     query 3 finished OK (585 seconds)
2016-02-20 05:50:52 [1455918652] :   running query 4
2016-02-20 05:58:16 [1455919096] :     query 4 finished OK (437 seconds)
2016-02-20 05:58:16 [1455919096] :   running query 5
2016-02-20 06:25:40 [1455920740] :     query 5 finished OK (1629 seconds)
2016-02-20 06:25:40 [1455920740] :   running query 6
2016-02-20 06:30:31 [1455921031] :     query 6 finished OK (144 seconds)
2016-02-20 06:30:31 [1455921032] :   running query 7
2016-02-20 07:05:45 [1455923145] :     query 7 finished OK (2103 seconds)
2016-02-20 07:05:45 [1455923145] :   running query 8
2016-02-20 07:27:34 [1455924454] :     query 8 finished OK (1275 seconds)
2016-02-20 07:27:34 [1455924454] :   running query 9
2016-02-20 08:03:32 [1455926612] :     query 9 finished OK (2147 seconds)
2016-02-20 08:03:32 [1455926612] :   running query 10
2016-02-20 08:11:52 [1455927112] :     query 10 finished OK (495 seconds)
2016-02-20 08:11:52 [1455927112] :   running query 11
2016-02-20 08:13:11 [1455927191] :     query 11 finished OK (77 seconds)
2016-02-20 08:13:11 [1455927191] :   running query 12
2016-02-20 08:19:07 [1455927547] :     query 12 finished OK (353 seconds)
2016-02-20 08:19:07 [1455927547] :   running query 13
2016-02-20 08:28:50 [1455928130] :     query 13 finished OK (579 seconds)
2016-02-20 08:28:50 [1455928130] :   running query 14
2016-02-20 08:31:18 [1455928278] :     query 14 finished OK (145 seconds)
2016-02-20 08:31:18 [1455928278] :   running query 15
2016-02-20 08:37:42 [1455928662] :     query 15 finished OK (380 seconds)
2016-02-20 08:37:42 [1455928662] :   running query 16
2016-02-20 08:50:28 [1455929428] :     query 16 finished OK (762 seconds)
2016-02-20 08:50:28 [1455929428] :   running query 17

TPC-H results using heap tables plus indexes (slower than csf; testing was not continued past the sixth query):
2016-02-21 11:10:48 [1456024248] :   running query 1
2016-02-21 11:17:22 [1456024642] :     query 1 finished OK (377 seconds)
2016-02-21 11:17:22 [1456024642] :   running query 2
2016-02-21 11:52:46 [1456026766] :     query 2 finished OK (2100 seconds) (ERROR:  Vitesse DB cannot pwrite to file)
2016-02-21 11:52:46 [1456026766] :   running query 3
2016-02-21 13:01:25 [1456030885] :     query 3 finished OK (4077 seconds)
2016-02-21 13:01:25 [1456030885] :   running query 4
2016-02-21 13:25:23 [1456032323] :     query 4 finished OK (1433 seconds)
2016-02-21 13:25:23 [1456032323] :   running query 5
2016-02-21 13:48:17 [1456033697] :     query 5 finished OK (1368 seconds)
2016-02-21 13:48:17 [1456033697] :   running query 6
2016-02-21 17:03:58 [1456045438] :     query 6 finished OK (did not finish after more than 4 hours; canceled manually)

For comparison, Greenplum with 16 segments on the same host:
2016-02-16 13:38:39 [1455601119] :   running query 1  
2016-02-16 14:09:58 [1455602998] :     query 1 finished OK (1866 seconds)  
2016-02-16 14:09:58 [1455602998] :   running query 2  
2016-02-16 14:12:17 [1455603137] :     query 2 finished OK (137 seconds)  
2016-02-16 14:12:17 [1455603137] :   running query 3  
2016-02-16 14:26:19 [1455603979] :     query 3 finished OK (833 seconds)  
2016-02-16 14:26:19 [1455603979] :   running query 4  
2016-02-16 14:26:21 [1455603981] :     query 4 finished OK (1 seconds)  
2016-02-16 14:26:21 [1455603981] :   running query 5  
2016-02-16 14:26:24 [1455603984] :     query 5 finished OK (2 seconds)  
2016-02-16 14:26:24 [1455603984] :   running query 6  
2016-02-16 14:26:25 [1455603985] :     query 6 finished OK (0 seconds)  
2016-02-16 14:26:25 [1455603985] :   running query 7  
2016-02-16 14:56:33 [1455605793] :     query 7 finished OK (1796 seconds) (aborted: out of workfile space)  
2016-02-16 14:56:33 [1455605793] :   running query 8  
2016-02-16 15:03:18 [1455606198] :     query 8 finished OK (403 seconds)  
2016-02-16 15:03:18 [1455606198] :   running query 9  
2016-02-16 15:30:09 [1455607809] :     query 9 finished OK (1604 seconds)  
2016-02-16 15:30:09 [1455607809] :   running query 10  
2016-02-16 15:30:12 [1455607812] :     query 10 finished OK (2 seconds)  
2016-02-16 15:30:12 [1455607812] :   running query 11  
2016-02-16 15:31:21 [1455607881] :     query 11 finished OK (68 seconds)  
2016-02-16 15:31:21 [1455607881] :   running query 12  
2016-02-16 15:31:23 [1455607883] :     query 12 finished OK (1 seconds)  
2016-02-16 15:31:23 [1455607883] :   running query 13  
2016-02-16 15:37:51 [1455608271] :     query 13 finished OK (385 seconds)  
2016-02-16 15:37:51 [1455608271] :   running query 14  
2016-02-16 15:37:53 [1455608273] :     query 14 finished OK (1 seconds)  
2016-02-16 15:37:53 [1455608273] :   running query 15  
2016-02-16 15:37:53 [1455608274] :     query 15 finished OK (0 seconds)  
2016-02-16 15:37:54 [1455608274] :   running query 16  
2016-02-16 15:41:04 [1455608464] :     query 16 finished OK (189 seconds)  
2016-02-16 15:41:04 [1455608464] :   running query 17  
2016-02-16 16:26:11 [1455611171] :     query 17 finished OK (2697 seconds)  
2016-02-16 16:26:11 [1455611171] :   running query 18  
2016-02-16 17:07:31 [1455613651] :     query 18 finished OK (2470 seconds)  
2016-02-16 17:07:31 [1455613651] :   running query 19  
2016-02-16 17:11:05 [1455613865] :     query 19 finished OK (210 seconds)  
2016-02-16 17:11:05 [1455613865] :   running query 20  
2016-02-16 17:11:08 [1455613868] :     query 20 finished OK (2 seconds)  
2016-02-16 17:11:08 [1455613868] :   running query 21  
2016-02-16 17:38:08 [1455615488] :     query 21 finished OK (1614 seconds)  
2016-02-16 17:38:08 [1455615488] :   running query 22  
2016-02-16 17:41:41 [1455615701] :     query 22 finished OK (211 seconds)  
2016-02-16 17:41:41 [1455615701] : finished TPC-H benchmark  

Deepgreen
2016-02-22 08:02:29 [1456099349] : running TPC-H benchmark
2016-02-22 08:02:29 [1456099349] : running queries defined in TPC-H benchmark
2016-02-22 08:02:29 [1456099349] :   running query 1
2016-02-22 08:05:44 [1456099544] :     query 1 finished OK (182 seconds)
2016-02-22 08:05:44 [1456099544] :   running query 2
2016-02-22 08:06:40 [1456099600] :     query 2 finished OK (54 seconds)
2016-02-22 08:06:40 [1456099600] :   running query 3
2016-02-22 08:16:10 [1456100170] :     query 3 finished OK (549 seconds)
2016-02-22 08:16:10 [1456100170] :   running query 4
2016-02-22 08:16:46 [1456100206] :     query 4 finished OK (18 seconds) (execution error: segment crash)
2016-02-22 08:16:46 [1456100206] :   running query 5
2016-02-22 08:17:37 [1456100257] :     query 5 finished OK (19 seconds) (execution error: exceeded vm limit)
2016-02-22 08:17:37 [1456100257] :   running query 6
2016-02-22 08:18:04 [1456100284] :     query 6 finished OK (8 seconds)
2016-02-22 08:18:04 [1456100284] :   running query 7
2016-02-22 09:30:23 [1456104623] :     query 7 finished OK (4303 seconds) (execution error: out of temp space)
2016-02-22 09:30:23 [1456104623] :   running query 8
2016-02-22 09:31:02 [1456104662] :     query 8 finished OK (21 seconds) (execution error: exceeded vm limit)
2016-02-22 09:31:02 [1456104662] :   running query 9
2016-02-22 09:31:41 [1456104701] :     query 9 finished OK (21 seconds) (execution error: exceeded vm limit)
2016-02-22 09:31:41 [1456104701] :   running query 10
2016-02-22 09:32:16 [1456104736] :     query 10 finished OK (17 seconds) (execution error: segment crash)
2016-02-22 09:32:16 [1456104736] :   running query 11
2016-02-22 09:33:03 [1456104783] :     query 11 finished OK (37 seconds)
2016-02-22 09:33:03 [1456104783] :   running query 12
2016-02-22 09:33:35 [1456104815] :     query 12 finished OK (16 seconds) (execution error: segment crash)
2016-02-22 09:33:35 [1456104815] :   running query 13
2016-02-22 09:38:48 [1456105128] :     query 13 finished OK (295 seconds)
2016-02-22 09:38:48 [1456105128] :   running query 14
2016-02-22 09:39:07 [1456105147] :     query 14 finished OK (9 seconds) (execution error: segment crash)
2016-02-22 09:39:07 [1456105147] :   running query 15
2016-02-22 09:39:49 [1456105189] :     query 15 finished OK (16 seconds)
2016-02-22 09:39:49 [1456105189] :   running query 16
2016-02-22 09:41:45 [1456105305] :     query 16 finished OK (115 seconds)
2016-02-22 09:41:45 [1456105305] :   running query 17
2016-02-22 10:13:07 [1456107187] :     query 17 finished OK (1853 seconds)
2016-02-22 10:13:07 [1456107187] :   running query 18
2016-02-22 10:14:07 [1456107247] :     query 18 finished OK (30 seconds) (execution error: exceeded vm limit)
2016-02-22 10:14:07 [1456107247] :   running query 19
2016-02-22 10:16:55 [1456107415] :     query 19 finished OK (157 seconds)
2016-02-22 10:16:55 [1456107415] :   running query 20
2016-02-22 10:17:16 [1456107436] :     query 20 finished OK (10 seconds)
2016-02-22 10:17:16 [1456107436] :   running query 21
2016-02-22 10:18:34 [1456107514] :     query 21 finished OK (37 seconds) (execution error: exceeded vm limit)
2016-02-22 10:18:34 [1456107514] :   running query 22
2016-02-22 10:22:32 [1456107752] :     query 22 finished OK (228 seconds)
2016-02-22 10:22:32 [1456107752] : finished TPC-H benchmark


Because Vitesse is based on 9.3, which does not yet have FDW qual pushdown, partitioned tables were not used, so its advantage is less pronounced.
Many Deepgreen queries failed to run due to errors; the ones that did complete were all faster than GP.
Both Greenplum and Deepgreen used partitioning: orders and lineitem were partitioned by month on the time columns used in the query predicates.

Use perf top to observe whether Vitesse's code is actually in use.
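For instance, to point perf at the backend executing a long-running query (the pid value below is a placeholder):

psql -A -t -c "select pid, query from pg_stat_activity where state = 'active'"
perf top -p 12345   # substitute the pid reported above; Vitesse frames such as parallelscan_task_fn should show up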

[Summary]
1.  perf shows where the time goes and whether Vitesse's code is being used.
2.  float4 and float8 are recommended; these are the types Vitesse has improved. The types commented out in the mktable script inside bench.tgz are presumably the unimproved ones, e.g. char, numeric, decimal.
3. bench.tgz saves you the trouble of editing the DDL yourself.
4. With bench.tgz you can choose a scale of 1 or 10; the corresponding files are downloaded from http://storage.googleapis.com/vitessedata/tpch10/ or tpch1/, decompressed, and loaded into the database.
For example
curl -s http://storage.googleapis.com/vitessedata/tpch10/customer.tbl.gz | gzip -d -c | psql tpch10f -c "COPY customer from stdin with csv delimiter '|'"
If you need a larger data set, generate it yourself with dbgen and load it.
5. Vitesse supports parallel query; a single SQL statement can use up all system resources, CPU and IO included, for very good performance.
vitesse 

----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw 
  2   7  73  18   0   0|2522M    0 | 968B 1054B|   0     0 |  24k 7385 
  2   7  73  17   0   1|2529M   76k| 498B 1422B|   0     0 |  25k 7437 
  2   7  72  18   0   1|2526M    0 |  60B  138B|   0     0 |  25k 7881 
  2   7  70  20   0   1|2526M   32k|1057B 1725B|   0     0 |  27k 8387 
  2   8  70  19   0   1|2522M    0 | 244B  855B|   0     0 |  26k 8056 
  2  10  68  19   0   1|2527M    0 | 654B 1662B|   0     0 |  29k 8984 

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           2.01    0.00   10.59   19.88    0.00   67.52
Device:         rrqm/s   wrqm/s     r/s     w/s   rsec/s   wsec/s avgrq-sz avgqu-sz   await  svctm  %util
sdb               1.00     0.00 10126.00    0.00 5171088.00     0.00   510.67   140.43   13.86   0.10 100.00


Community PostgreSQL 

----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw 
  3   2  95   1   0   0|  24M  540k| 452B 1026B|   0     0 |6400  4400 
  3   2  95   1   0   0|  24M  540k| 635B 1179B|   0     0 |6637  4446 
  3   2  95   1   0   0|  24M  540k| 604B 1092B|   0     0 |6404  4391 
  3   2  95   1   0   0|  24M  560k|  60B  106B|   0     0 |6449  4421 
  3   2  94   1   0   0|  24M  624k| 452B  988B|   0     0 |7380  4751 
  3   2  95   1   0   0|  24M  540k| 272B  356B|   0     0 |7280  4634 
  3   2  94   1   0   0|  24M  540k|1477B 2772B|   0     0 |  11k 5432

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           2.92    0.00    1.88    0.75    0.00   94.46
Device:         rrqm/s   wrqm/s     r/s     w/s   rsec/s   wsec/s avgrq-sz avgqu-sz   await  svctm  %util
sdb               0.00     0.00  758.00    0.00 48512.00     0.00    64.00     0.22    0.30   0.29  22.30

6. Column store with column compression gives better performance.
7. Extensions currently supported by Vitesse (extensions that have been optimized, plus the foreign-table wrappers):

file_fdw  -- file foreign table, also improved; much faster than community PostgreSQL
hstore
ltree
pg_stat_statements
plpgsql
postgres_fdw
vitesse_csf_fdw  -- column-store foreign table
vitesse_spq_fdw  -- parquet vpartd server foreign table

vitesse_csf_fdw | 1.0 | public | foreign-data wrapper for vitesse simple csf
vitesse_spq_fdw | 1.0 | csf | foreign-data wrapper for vitesse simple parquet vpartd server


[Issue tracking]
1. Highly concurrent data loading hit an XFS anomaly that left database IO hung:
[9721742.424033] XFS: possible memory allocation deadlock in kmem_alloc (mode:0x250)
[9721794.660719] XFS: Internal error XFS_WANT_CORRUPTED_GOTO at line 1327 of file fs/xfs/xfs_alloc.c.  Caller 0xffffffffa04b8bfd
[9721794.660721] 
[9721794.673811] Pid: 17269, comm: xfsalloc/6 Not tainted 2.6.32-358.23.2.ali1195.el6.x86_64 #1
[9721794.682326] Call Trace:
[9721794.685035]  [<ffffffffa04e2b9f>] ? xfs_error_report+0x3f/0x50 [xfs]
[9721794.691620]  [<ffffffffa04b8bfd>] ? xfs_alloc_ag_vextent+0xad/0x100 [xfs]
[9721794.698653]  [<ffffffffa04b5c79>] ? xfs_alloc_lookup_eq+0x19/0x20 [xfs]
[9721794.705493]  [<ffffffffa04b7c8f>] ? xfs_alloc_ag_vextent_size+0x39f/0x6e0 [xfs]
[9721794.713081]  [<ffffffffa04b8bfd>] ? xfs_alloc_ag_vextent+0xad/0x100 [xfs]
[9721794.720121]  [<ffffffffa04b966c>] ? xfs_alloc_vextent+0x2dc/0x620 [xfs]
[9721794.726979]  [<ffffffffa04c44e2>] ? xfs_bmap_btalloc+0x5f2/0x700 [xfs]
[9721794.733734]  [<ffffffffa04c45fe>] ? xfs_bmap_alloc+0xe/0x10 [xfs]
[9721794.740056]  [<ffffffffa04c46da>] ? xfs_bmapi_allocate_worker+0x4a/0x80 [xfs]
[9721794.747467]  [<ffffffffa04c4690>] ? xfs_bmapi_allocate_worker+0x0/0x80 [xfs]
[9721794.754792]  [<ffffffff81094060>] ? worker_thread+0x170/0x2d0
[9721794.760790]  [<ffffffff8109a310>] ? autoremove_wake_function+0x0/0x40
[9721794.767443]  [<ffffffff81093ef0>] ? worker_thread+0x0/0x2d0
[9721794.773229]  [<ffffffff81099f76>] ? kthread+0x96/0xa0
[9721794.778526]  [<ffffffff8100d0ca>] ? child_rip+0xa/0x20
[9721794.783876]  [<ffffffff81099ee0>] ? kthread+0x0/0xa0
[9721794.789065]  [<ffffffff8100d0c0>] ? child_rip+0x0/0x20
[9721794.794430] XFS (sdb1): page discard on page ffffea008772a808, inode 0x1f0a09108, offset 1716993794048.
[9728270.587283] XFS: possible memory allocation deadlock in kmem_alloc (mode:0x250)
[9728273.293897] XFS: possible memory allocation deadlock in kmem_alloc (mode:0x250)
[9728278.743195] XFS: possible memory allocation deadlock in kmem_alloc (mode:0x250)
[9730071.343972] XFS: possible memory allocation deadlock in kmem_alloc (mode:0x250)

[vitesse tpch ddl]

CREATE EXTENSION vitesse_csf_fdw;
CREATE SERVER csf_fdw_server FOREIGN DATA WRAPPER vitesse_csf_fdw;

-- create tables. use lz4 compression for comments.
CREATE FOREIGN TABLE NATION (
N_NATIONKEY bigint NOT NULL,
N_NAME VARCHAR(25) /* CHAR(25) */  NOT NULL,
N_REGIONKEY bigint NOT NULL,
N_COMMENT VARCHAR(152) OPTIONS (compression 'lz4') 
)
server csf_fdw_server;

CREATE FOREIGN TABLE REGION (
R_REGIONKEY bigint NOT NULL,
R_NAME VARCHAR(25) /* CHAR(25) */ NOT NULL,
R_COMMENT VARCHAR(152) OPTIONS (compression 'lz4')
)
server csf_fdw_server;

CREATE FOREIGN TABLE PART (
P_PARTKEY bigint NOT NULL,
P_NAME VARCHAR(55) NOT NULL,
P_MFGR VARCHAR(25) /* CHAR(25) */ NOT NULL,
P_BRAND VARCHAR(10) /* CHAR(10) */ NOT NULL,
P_TYPE VARCHAR(25) NOT NULL,
P_SIZE bigint NOT NULL,
P_CONTAINER VARCHAR(10) /* CHAR(10) */ NOT NULL,
P_RETAILPRICE FLOAT8 /* DECIMAL(15,2) */ NOT NULL,
P_COMMENT VARCHAR(23) OPTIONS (compression 'lz4') NOT NULL
)
server csf_fdw_server;

CREATE FOREIGN TABLE SUPPLIER ( 
S_SUPPKEY bigint NOT NULL,
S_NAME VARCHAR(25) /* CHAR(25) */ NOT NULL,
S_ADDRESS VARCHAR(40) NOT NULL,
S_NATIONKEY bigint NOT NULL,
S_PHONE VARCHAR(15) /* CHAR(15) */ NOT NULL,
S_ACCTBAL FLOAT8 /* DECIMAL(15,2) */ NOT NULL,
S_COMMENT VARCHAR(101) OPTIONS (compression 'lz4') NOT NULL
)
server csf_fdw_server;

CREATE FOREIGN TABLE PARTSUPP(
PS_PARTKEY bigint NOT NULL,
PS_SUPPKEY bigint NOT NULL,
PS_AVAILQTY bigint NOT NULL,
PS_SUPPLYCOST FLOAT8 /* DECIMAL(15,2) */  NOT NULL,
PS_COMMENT VARCHAR(199) OPTIONS (compression 'lz4') NOT NULL
)
server csf_fdw_server;

CREATE FOREIGN TABLE CUSTOMER(
C_CUSTKEY bigint NOT NULL,
C_NAME VARCHAR(25) NOT NULL,
C_ADDRESS VARCHAR(40) NOT NULL,
C_NATIONKEY bigint NOT NULL,
C_PHONE VARCHAR(15) /* CHAR(15) */ NOT NULL,
C_ACCTBAL float8 /* DECIMAL(15,2) */  NOT NULL,
C_MKTSEGMENT VARCHAR(10) /* CHAR(10) */ NOT NULL,
C_COMMENT VARCHAR(117) OPTIONS (compression 'lz4')NOT NULL
)
server csf_fdw_server;

CREATE FOREIGN TABLE ORDERS (
O_ORDERKEY bigint NOT NULL,
O_CUSTKEY bigint NOT NULL,
O_ORDERSTATUS VARCHAR(1) /* CHAR(1) */ NOT NULL,
O_TOTALPRICE float8 /* DECIMAL(15,2) */ NOT NULL,
O_ORDERDATE DATE NOT NULL,
O_ORDERPRIORITY VARCHAR(15) /* CHAR(15) */ NOT NULL,
O_CLERK VARCHAR(15) /* CHAR(15) */ NOT NULL,
O_SHIPPRIORITY bigint NOT NULL,
O_COMMENT VARCHAR(79) OPTIONS (compression 'lz4') NOT NULL
)
server csf_fdw_server;

CREATE FOREIGN TABLE LINEITEM(
L_ORDERKEY bigint NOT NULL,
L_PARTKEY bigint NOT NULL,
L_SUPPKEY bigint NOT NULL,
L_LINENUMBER bigint NOT NULL,
L_QUANTITY bigint /* DECIMAL(15,2) */ NOT NULL,
L_EXTENDEDPRICE FLOAT8 /* DECIMAL(15,2) */ NOT NULL,
L_DISCOUNT DOUBLE PRECISION /* DECIMAL(15,2) */ NOT NULL,
L_TAX DOUBLE PRECISION /* DECIMAL(15,2) */ NOT NULL,
L_RETURNFLAG VARCHAR(1) /* CHAR(1) */ NOT NULL,
L_LINESTATUS VARCHAR(1) /* CHAR(1) */ NOT NULL,
L_SHIPDATE DATE NOT NULL,
L_COMMITDATE DATE NOT NULL,
L_RECEIPTDATE DATE NOT NULL,
L_SHIPINSTRUCT VARCHAR(25) /* CHAR(25) */ NOT NULL,
L_SHIPMODE VARCHAR(10) /* CHAR(10) */ NOT NULL,
L_COMMENT VARCHAR(44) OPTIONS (compression'lz4') NOT NULL
)
server csf_fdw_server;

csf foreign tables require an explicit analyze:

analyze customer;
analyze lineitem;
analyze nation;
analyze orders;
analyze part;
analyze partsupp;
analyze region;
analyze supplier;

DDL for the heap tables and their indexes:


CREATE TABLE NATION (
N_NATIONKEY bigint NOT NULL,
N_NAME VARCHAR(25) /* CHAR(25) */ NOT NULL,
N_REGIONKEY bigint NOT NULL,
N_COMMENT VARCHAR(152)
);

CREATE TABLE REGION (
R_REGIONKEY bigint NOT NULL,
R_NAME VARCHAR(25) /* CHAR(25) */ NOT NULL,
R_COMMENT VARCHAR(152)
);

CREATE TABLE PART (
P_PARTKEY bigint NOT NULL,
P_NAME VARCHAR(55) NOT NULL,
P_MFGR VARCHAR(25) /* CHAR(25) */ NOT NULL,
P_BRAND VARCHAR(10) /* CHAR(10) */ NOT NULL,
P_TYPE VARCHAR(25) NOT NULL,
P_SIZE bigint NOT NULL,
P_CONTAINER VARCHAR(10) /* CHAR(10) */ NOT NULL,
P_RETAILPRICE FLOAT8 /* DECIMAL(15,2) */ NOT NULL,
P_COMMENT VARCHAR(23)NOT NULL
);

CREATE TABLE SUPPLIER (
S_SUPPKEY bigint NOT NULL,
S_NAME VARCHAR(25) /* CHAR(25) */ NOT NULL,
S_ADDRESS VARCHAR(40) NOT NULL,
S_NATIONKEY bigint NOT NULL,
S_PHONE VARCHAR(15) /* CHAR(15) */ NOT NULL,
S_ACCTBAL FLOAT8 /* DECIMAL(15,2) */ NOT NULL,
S_COMMENT VARCHAR(101)NOT NULL
);

CREATE TABLE PARTSUPP(
PS_PARTKEY bigint NOT NULL,
PS_SUPPKEY bigint NOT NULL,
PS_AVAILQTY bigint NOT NULL,
PS_SUPPLYCOST FLOAT8 /* DECIMAL(15,2) */ NOT NULL,
PS_COMMENT VARCHAR(199)NOT NULL
);

CREATE TABLE CUSTOMER(
C_CUSTKEY bigint NOT NULL,
C_NAME VARCHAR(25) NOT NULL,
C_ADDRESS VARCHAR(40) NOT NULL,
C_NATIONKEY bigint NOT NULL,
C_PHONE VARCHAR(15) /* CHAR(15) */ NOT NULL,
C_ACCTBAL float8 /* DECIMAL(15,2) */ NOT NULL,
C_MKTSEGMENT VARCHAR(10) /* CHAR(10) */ NOT NULL,
C_COMMENT VARCHAR(117)NOT NULL
);

CREATE TABLE ORDERS (
O_ORDERKEY bigint NOT NULL,
O_CUSTKEY bigint NOT NULL,
O_ORDERSTATUS VARCHAR(1) /* CHAR(1) */ NOT NULL,
O_TOTALPRICE float8 /* DECIMAL(15,2) */ NOT NULL,
O_ORDERDATE DATE NOT NULL,
O_ORDERPRIORITY VARCHAR(15) /* CHAR(15) */ NOT NULL,
O_CLERK VARCHAR(15) /* CHAR(15) */ NOT NULL,
O_SHIPPRIORITY bigint NOT NULL,
O_COMMENT VARCHAR(79)NOT NULL
);

CREATE TABLE LINEITEM(
L_ORDERKEY bigint NOT NULL,
L_PARTKEY bigint NOT NULL,
L_SUPPKEY bigint NOT NULL,
L_LINENUMBER bigint NOT NULL,
L_QUANTITY bigint /* DECIMAL(15,2) */ NOT NULL,
L_EXTENDEDPRICE FLOAT8 /* DECIMAL(15,2) */ NOT NULL,
L_DISCOUNT DOUBLE PRECISION /* DECIMAL(15,2) */ NOT NULL,
L_TAX DOUBLE PRECISION /* DECIMAL(15,2) */ NOT NULL,
L_RETURNFLAG VARCHAR(1) /* CHAR(1) */ NOT NULL,
L_LINESTATUS VARCHAR(1) /* CHAR(1) */ NOT NULL,
L_SHIPDATE DATE NOT NULL,
L_COMMITDATE DATE NOT NULL,
L_RECEIPTDATE DATE NOT NULL,
L_SHIPINSTRUCT VARCHAR(25) /* CHAR(25) */ NOT NULL,
L_SHIPMODE VARCHAR(10) /* CHAR(10) */ NOT NULL,
L_COMMENT VARCHAR(44) NOT NULL
);

ALTER TABLE PART ADD PRIMARY KEY (P_PARTKEY);
ALTER TABLE SUPPLIER ADD PRIMARY KEY (S_SUPPKEY);
ALTER TABLE PARTSUPP ADD PRIMARY KEY (PS_PARTKEY, PS_SUPPKEY);
ALTER TABLE CUSTOMER ADD PRIMARY KEY (C_CUSTKEY);
ALTER TABLE ORDERS ADD PRIMARY KEY (O_ORDERKEY);
ALTER TABLE NATION ADD PRIMARY KEY (N_NATIONKEY);
ALTER TABLE REGION ADD PRIMARY KEY (R_REGIONKEY);

CREATE INDEX IDX_SUPPLIER_NATION_KEY ON SUPPLIER (S_NATIONKEY);
CREATE INDEX IDX_PARTSUPP_PARTKEY ON PARTSUPP (PS_PARTKEY);
CREATE INDEX IDX_PARTSUPP_SUPPKEY ON PARTSUPP (PS_SUPPKEY);
CREATE INDEX IDX_CUSTOMER_NATIONKEY ON CUSTOMER (C_NATIONKEY);
CREATE INDEX IDX_ORDERS_CUSTKEY ON ORDERS (O_CUSTKEY);
CREATE INDEX IDX_NATION_REGIONKEY ON NATION (N_REGIONKEY);
CREATE INDEX IDX_ORDERS_ORDERDATE ON ORDERS (O_ORDERDATE);

ALTER TABLE LINEITEM ADD PRIMARY KEY (L_ORDERKEY, L_LINENUMBER);

CREATE INDEX IDX_LINEITEM_ORDERKEY ON LINEITEM (L_ORDERKEY);
CREATE INDEX IDX_LINEITEM_PART_SUPP ON LINEITEM (L_PARTKEY,L_SUPPKEY);
CREATE INDEX IDX_LINEITEM_SHIPDATE ON LINEITEM (L_SHIPDATE, L_DISCOUNT, L_QUANTITY);


Why Greenplum's CPU shows a large %ni share


While running Greenplum, CPU monitoring shows a large %ni share.
ni denotes low-priority user mode; a process's priority can be set with setpriority, where a larger number means a lower priority.
The CPU statistics categories in top are explained as follows:

   2c. SUMMARY Area Fields
       The summary area fields describing CPU statistics are abbreviated.  They provide information about times spent in:
           us = user mode
           sy = system mode
           ni = low priority user mode (nice)
           id = idle task
           wa = I/O waiting
           hi = servicing IRQs
           si = servicing soft IRQs
           st = steal (time given to other DomU instances)

setpriority is used as follows:

NAME
       getpriority, setpriority - get/set program scheduling priority

SYNOPSIS
       #include <sys/time.h>
       #include <sys/resource.h>

       int getpriority(int which, int who);
       int setpriority(int which, int who, int prio);
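From the shell, nice and renice wrap these calls, which makes the %ni accounting easy to reproduce; a quick sketch:

nice -n 19 yes > /dev/null &   # burn CPU at the lowest user priority
top -b -n 1 | grep Cpu         # the burned cycles are accounted under ni, not us
kill %1                        # stop the CPU burner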

Searching the Greenplum source turns up setpriority in src/backend/tcop/postgres.c:

/*
 * Change the priority of the current process to the specified level
 * (bigger nice_level values correspond to lower priority).
*/
static bool renice_current_process(int nice_level)
{
#ifdef WIN32
        elog(DEBUG2, "Renicing of processes on Windows currently not supported.");
        return false;
#else
        int prio_out = -1;
        elog(DEBUG2, "Current nice level of the process: %d",
                        getpriority(PRIO_PROCESS, 0));
        prio_out = setpriority(PRIO_PROCESS, 0, nice_level);
        if (prio_out == -1)
        {
                switch (errno)
                {
                case EACCES:
                        elog(DEBUG1, "Could not change priority of the query process, errno: %d (%m).", errno);
                        break;
                case ESRCH:
                        /* ignore this, the backend went away when we weren't looking */
                        break;
                default:
                        elog(DEBUG1, "Could not change priority of the query process, errno: %d (%m).", errno);
                }
                return false;
        }

        elog(DEBUG2, "Reniced process to level %d", getpriority(PRIO_PROCESS, 0));
        return true;
#endif
}

The function above is called from exec_mpp_query:

/*
 * exec_mpp_query
 *
 * Called in a qExec process to read and execute a query plan sent by
 * cdbdisp_dispatchPlan().
 *
 * query_string -- optional query text (C string).
 * serializedQuerytree[len]  -- Query node or (NULL,0) if plan provided.
 * serializedPlantree[len] -- PlannedStmt node, or (NULL,0) if query provided.
 * serializedParms[len] -- optional parameters
 * serializedSliceInfo[len] -- optional SliceTable
 * localSlice -- slice table index
 *
 * Caller may supply either a Query (representing utility command) or
 * a PlannedStmt (representing a planned DML command), but not both.
 */
static void
exec_mpp_query(const char *query_string, 
                           const char * serializedQuerytree, int serializedQuerytreelen,
                           const char * serializedPlantree, int serializedPlantreelen,
                           const char * serializedParams, int serializedParamslen,
                           const char * serializedSliceInfo, int serializedSliceInfolen,
                           const char * seqServerHost, int seqServerPort,
                           int localSlice)
{
...
        /* Downgrade segworker process priority */
                if (gp_segworker_relative_priority != 0)
                {
                        renice_current_process(PostmasterPriority + gp_segworker_relative_priority);
                }

gp_segworker_relative_priority is a postmaster startup parameter; its default is 20:

        {
                {"gp_segworker_relative_priority", PGC_POSTMASTER, RESOURCES_MGM,
                        gettext_noop("Priority for the segworkers relative to the postmaster's priority."),
                        NULL,
                        GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
                },
                &gp_segworker_relative_priority,
                PRIO_MAX,
                0, PRIO_MAX, NULL, NULL
        },
...

src/include/cdb/cdbvars.h:#define PRIO_MAX 20
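
On a running system you can confirm the value by asking for the GUC by name (a sketch: the GUC is flagged GUC_NO_SHOW_ALL, so it does not appear in SHOW ALL, but SHOW by name should still work):

postgres=# show gp_segworker_relative_priority;  -- expected to report 20 unless overridden at startup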

This explains why hosts running Greenplum show a large nice (%ni) share in their CPU statistics: every segworker is reniced to the lowest priority (nice 20), so its user-mode CPU time is accounted under %ni.


New features in PostgreSQL 9.5 - horizontal sharding architecture and practice


PostgreSQL 9.5 brings several excellent enhancements to foreign data wrappers:
1. Constraints can be created on foreign tables
2. The optimizer can push query predicates down to foreign tables
3. Foreign tables can take part in inheritance
4. IMPORT FOREIGN SCHEMA creates foreign tables in one step
The first three enhancements are enough to shard a database horizontally: PostgreSQL 9.5 serves as the master, with other versions serving as the data nodes, achieving a horizontally partitioned database.

Compared with middleware-based sharding, this approach offers:
1. Cross-database JOINs
2. Bind variables (prepared statements)
3. ACID
4. Distributed transactions (no user-managed 2PC)
5. Horizontally scalable master nodes
6. Horizontally scalable segment nodes
7. Functions and stored procedures


The architecture is illustrated below.
Architecture 1:
Dimension tables live on the upper node. The benefit is more efficient JOINs; the drawbacks are that with several upper nodes the dimension tables must be kept in sync across them (by front-end or back-end synchronization), and that large or DML-heavy dimension data makes the master database heavyweight.
Front-end synchronization can use foreign tables driven by triggers: INSERT/UPDATE/DELETE on the parent table fan out to all child tables, while SELECT reads the local table and bypasses the trigger.
Back-end synchronization can use logical decoding (a minimal sketch follows).
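
A minimal sketch of back-end synchronization with logical decoding, assuming wal_level=logical, a free replication slot, and the contrib test_decoding output plugin (the slot name dim_sync is made up):

-- on the node holding the authoritative copy of the dimension table:
select pg_create_logical_replication_slot('dim_sync', 'test_decoding');
-- a consumer periodically drains the change stream and replays it on the other masters:
select * from pg_logical_slot_get_changes('dim_sync', NULL, NULL);

In practice a real consumer would use the streaming replication protocol (e.g. pg_recvlogical) rather than polling with SQL.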
Architecture 2:
Dimension tables are stored on the shard nodes; any dimension table may live on any shard node.
The drawback is that JOINs must ship data up to the upper node, which is less efficient.
A further drawback is that the data nodes can become unbalanced.

With multiple replicas of each dimension table, the imbalance disappears: queries can target any single replica, while INSERT/UPDATE/DELETE must target every replica.
HA architecture:

The upper node runs version 9.5 or later and holds the dimension (global) tables, the shard definitions, and the routing logic.
The shard definitions consist of the inheritance relationships, the distribution-column constraints, the foreign table definitions, and the insert triggers.
The lower nodes may run any database version; they hold the data shards.

Below is a test setup with 5 databases: 1 master database holding the global data, shard definitions and routing logic, plus 4 lower-level databases holding the data shards:

master=# create database db0;  
master=# create database db1;  
master=# create database db2;  
master=# create database db3;  
master=# create database master;  

Connect to the master database and create the foreign servers:

d3=# \c master  
master=# create extension postgres_fdw;  
master=# create server db0 foreign data wrapper postgres_fdw options (hostaddr '127.0.0.1', port '1923', dbname 'db0');  
master=# create server db1 foreign data wrapper postgres_fdw options (hostaddr '127.0.0.1', port '1923', dbname 'db1');  
master=# create server db2 foreign data wrapper postgres_fdw options (hostaddr '127.0.0.1', port '1923', dbname 'db2');  
master=# create server db3 foreign data wrapper postgres_fdw options (hostaddr '127.0.0.1', port '1923', dbname 'db3');  

Create the user mappings:

master=# create user mapping for postgres server db0 options (user 'postgres', password 'postgres');  
master=# create user mapping for postgres server db1 options (user 'postgres', password 'postgres');  
master=# create user mapping for postgres server db2 options (user 'postgres', password 'postgres');  
master=# create user mapping for postgres server db3 options (user 'postgres', password 'postgres');  

Connect to each shard node and create the shard tables (the table names are up to you):

master=# \c db0  
db0=# create table tbl0(id int primary key, info text, crt_time timestamp);  
alter table tbl0 add constraint ck1 check (abs(mod(id,4))=0);  
master=# \c db1  
db1=# create table tbl1(id int primary key, info text, crt_time timestamp);  
alter table tbl1 add constraint ck1 check (abs(mod(id,4))=1);  
master=# \c db2  
db2=# create table tbl2(id int primary key, info text, crt_time timestamp);  
alter table tbl2 add constraint ck1 check (abs(mod(id,4))=2);  
master=# \c db3  
db3=# create table tbl3(id int primary key, info text, crt_time timestamp);  
alter table tbl3 add constraint ck1 check (abs(mod(id,4))=3);  

Connect to the master node and create the foreign tables; the IMPORT FOREIGN SCHEMA syntax creates them in one step:

db3=# \c master  
You are now connected to database "master" as user "postgres".  
master=# import FOREIGN SCHEMA public from server db0 into public;   
IMPORT FOREIGN SCHEMA  
master=# import FOREIGN SCHEMA public from server db1 into public;   
IMPORT FOREIGN SCHEMA  
master=# import FOREIGN SCHEMA public from server db2 into public;   
IMPORT FOREIGN SCHEMA  
master=# import FOREIGN SCHEMA public from server db3 into public;   
IMPORT FOREIGN SCHEMA  
master=# \det  
 List of foreign tables  
 Schema | Table | Server   
--------+-------+--------  
 public | tbl0  | db0  
 public | tbl1  | db1  
 public | tbl2  | db2  
 public | tbl3  | db3  
(4 rows)  

Create the parent table; users then operate only on the parent. (They can of course also hit the child tables directly; PostgreSQL won't stop them.)

master=# create table tbl(id int, info text, crt_time timestamp);  
CREATE TABLE  

Attach the foreign tables to the parent table via inheritance.

master=# alter foreign table tbl0 inherit tbl;  
ALTER FOREIGN TABLE  
master=# alter foreign table tbl1 inherit tbl;  
ALTER FOREIGN TABLE  
master=# alter foreign table tbl2 inherit tbl;  
ALTER FOREIGN TABLE  
master=# alter foreign table tbl3 inherit tbl;  
ALTER FOREIGN TABLE  

Create the constraints on the foreign tables; the constraints form part of the routing logic.
Note: SQL that carries the constraint expression is routed automatically to the matching foreign table;
SQL without it hits every node.
So it is recommended that every SQL statement carry the constraint expression.

master=# alter foreign table tbl0 add constraint ck_tbl0 check (abs(mod(id,4))=0);  
ALTER FOREIGN TABLE  
master=# alter foreign table tbl1 add constraint ck_tbl1 check (abs(mod(id,4))=1);  
ALTER FOREIGN TABLE  
master=# alter foreign table tbl2 add constraint ck_tbl2 check (abs(mod(id,4))=2);  
ALTER FOREIGN TABLE  
master=# alter foreign table tbl3 add constraint ck_tbl3 check (abs(mod(id,4))=3);  
ALTER FOREIGN TABLE  

With the predicate abs(mod(id, 4)) = (abs(mod(100, 4))) present, the matching foreign table is chosen:

master=# explain select * from tbl where id=100 and abs(mod(id, 4)) = (abs(mod(100, 4)));  
                            QUERY PLAN                               
-------------------------------------------------------------------  
 Append  (cost=0.00..134.10 rows=2 width=44)  
   ->  Seq Scan on tbl  (cost=0.00..0.00 rows=1 width=44)  
         Filter: ((id = 100) AND (abs(mod(id, 4)) = 0))  
   ->  Foreign Scan on tbl0  (cost=100.00..134.10 rows=1 width=44)  
(4 rows)  

master=# explain select * from tbl where id=101 and abs(mod(id, 4)) = (abs(mod(101, 4)));  
                            QUERY PLAN                               
-------------------------------------------------------------------  
 Append  (cost=0.00..134.10 rows=2 width=44)  
   ->  Seq Scan on tbl  (cost=0.00..0.00 rows=1 width=44)  
         Filter: ((id = 101) AND (abs(mod(id, 4)) = 1))  
   ->  Foreign Scan on tbl1  (cost=100.00..134.10 rows=1 width=44)  
(4 rows)  

Without the predicate abs(mod(id, 4)) = (abs(mod(100, 4))), every foreign table is scanned:

master=# explain select * from tbl where id=100;  
                            QUERY PLAN                               
-------------------------------------------------------------------  
 Append  (cost=0.00..500.68 rows=25 width=44)  
   ->  Seq Scan on tbl  (cost=0.00..0.00 rows=1 width=44)  
         Filter: (id = 100)  
   ->  Foreign Scan on tbl0  (cost=100.00..125.17 rows=6 width=44)  
   ->  Foreign Scan on tbl1  (cost=100.00..125.17 rows=6 width=44)  
   ->  Foreign Scan on tbl2  (cost=100.00..125.17 rows=6 width=44)  
   ->  Foreign Scan on tbl3  (cost=100.00..125.17 rows=6 width=44)  
(7 rows)  

master=# explain select count(*),sum(id),avg(id+id) from tbl;  
                                QUERY PLAN                                   
---------------------------------------------------------------------------  
 Aggregate  (cost=908.01..908.02 rows=1 width=4)  
   ->  Append  (cost=0.00..791.00 rows=11701 width=4)  
         ->  Seq Scan on tbl  (cost=0.00..0.00 rows=1 width=4)  
         ->  Foreign Scan on tbl0  (cost=100.00..197.75 rows=2925 width=4)  
         ->  Foreign Scan on tbl1  (cost=100.00..197.75 rows=2925 width=4)  
         ->  Foreign Scan on tbl2  (cost=100.00..197.75 rows=2925 width=4)  
         ->  Foreign Scan on tbl3  (cost=100.00..197.75 rows=2925 width=4)  
(7 rows)  

Create the insert-routing trigger function:

master=# create or replace function f_tbl_ins() returns trigger as $$  
declare  
begin  
  case abs(mod(NEW.id, 4))   
    when 0 then  
      insert into tbl0 (id, info, crt_time) values (NEW.*);  
    when 1 then  
      insert into tbl1 (id, info, crt_time) values (NEW.*);  
    when 2 then  
      insert into tbl2 (id, info, crt_time) values (NEW.*);  
    when 3 then  
      insert into tbl3 (id, info, crt_time) values (NEW.*);  
    else  
      return null;  
  end case;  
    return null;  
end;  
$$ language plpgsql;  

Create the insert trigger:

master=# create trigger tg1 before insert on tbl for each row execute procedure f_tbl_ins();  
CREATE TRIGGER  

Test whether insert routing works correctly:

master=# insert into tbl values (1,'abc',now());  
INSERT 0 0  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
(1 row)  

master=# insert into tbl values (2,'abc',now());  
INSERT 0 0  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
  2 | abc  | 2016-02-23 09:17:55.065332  
(2 rows)  

master=# select * from tbl where id=2;  
 id | info |          crt_time            
----+------+----------------------------  
  2 | abc  | 2016-02-23 09:17:55.065332  
(1 row)  

master=# select * from tbl where id=2 and abs(mod(id, 4))=abs(mod(2, 4));  
 id | info |          crt_time            
----+------+----------------------------  
  2 | abc  | 2016-02-23 09:17:55.065332  
(1 row)  

Again, with the routing predicate present, only the matching foreign table is accessed:

master=# explain select * from tbl where id=2 and abs(mod(id, 4))=abs(mod(2, 4));  
                            QUERY PLAN                               
-------------------------------------------------------------------  
 Append  (cost=0.00..134.10 rows=2 width=44)  
   ->  Seq Scan on tbl  (cost=0.00..0.00 rows=1 width=44)  
         Filter: ((id = 2) AND (abs(mod(id, 4)) = 2))  
   ->  Foreign Scan on tbl2  (cost=100.00..134.10 rows=1 width=44)  
(4 rows)  

Inserts with a NULL id are skipped by the trigger:

master=# insert into tbl values (null,'abc',now());  
INSERT 0 0  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
  2 | abc  | 2016-02-23 09:17:55.065332  
(2 rows)  

Distributed transactions are supported. The plan shows the remote side uses FOR UPDATE; on commit the upper node commits the remote nodes first and the local node after:

master=# explain (verbose) update tbl set info='new' where id=2 and abs(mod(id, 4))=abs(mod(2, 4));  
                                                       QUERY PLAN                                                         
------------------------------------------------------------------------------------------------------------------------  
 Update on public.tbl  (cost=0.00..149.02 rows=2 width=18)  
   Update on public.tbl  
   Foreign Update on public.tbl2  
     Remote SQL: UPDATE public.tbl2 SET info = $2 WHERE ctid = $1  
   ->  Seq Scan on public.tbl  (cost=0.00..0.00 rows=1 width=18)  
         Output: tbl.id, 'new'::text, tbl.crt_time, tbl.ctid  
         Filter: ((tbl.id = 2) AND (abs(mod(tbl.id, 4)) = 2))  
   ->  Foreign Scan on public.tbl2  (cost=100.00..149.02 rows=1 width=18)  
         Output: tbl2.id, 'new'::text, tbl2.crt_time, tbl2.ctid  
         Remote SQL: SELECT id, crt_time, ctid FROM public.tbl2 WHERE ((id = 2)) AND ((abs(mod(id, 4)) = 2)) FOR UPDATE  
(10 rows)  

master=# explain (verbose) delete from tbl where id=2 and abs(mod(id, 4))=abs(mod(2, 4));  
                                                QUERY PLAN                                                  
----------------------------------------------------------------------------------------------------------  
 Delete on public.tbl  (cost=0.00..164.62 rows=2 width=6)  
   Delete on public.tbl  
   Foreign Delete on public.tbl2  
     Remote SQL: DELETE FROM public.tbl2 WHERE ctid = $1  
   ->  Seq Scan on public.tbl  (cost=0.00..0.00 rows=1 width=6)  
         Output: tbl.ctid  
         Filter: ((tbl.id = 2) AND (abs(mod(tbl.id, 4)) = 2))  
   ->  Foreign Scan on public.tbl2  (cost=100.00..164.62 rows=1 width=6)  
         Output: tbl2.ctid  
         Remote SQL: SELECT ctid FROM public.tbl2 WHERE ((id = 2)) AND ((abs(mod(id, 4)) = 2)) FOR UPDATE  
(10 rows)  

Statements against foreign tables can be rolled back:

master=# begin ;  
BEGIN  
master=# insert into tbl0 values (0,'abc',now());  
INSERT 0 1  
master=# rollback;  
ROLLBACK  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
  2 | abc  | 2016-02-23 09:17:55.065332  
(2 rows)  

Global consistency is maintained for cross-database transactions:

master=# begin;  
BEGIN  
master=# insert into tbl values(3,'new',now());  
INSERT 0 0  
master=# insert into tbl values(1,'new',now());  
ERROR:  duplicate key value violates unique constraint "pk"  
DETAIL:  Key (id)=(1) already exists.  
CONTEXT:  Remote SQL command: INSERT INTO public.tbl1(id, info, crt_time) VALUES ($1, $2, $3)  
SQL statement "insert into tbl1 (id, info, crt_time) values (NEW.*)"  
PL/pgSQL function f_tbl_ins() line 8 at SQL statement  
master=# end;  
ROLLBACK  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
  2 | abc  | 2016-02-23 09:17:55.065332  
(2 rows)  

Global consistency is also maintained when a transaction mixes cross-database and local statements:

master=# create table test(id int primary key, info text);  
CREATE TABLE  
master=# begin;  
BEGIN  
master=# insert into test values (1,'abc');  
INSERT 0 1  
master=# insert into tbl values(3,'new',now());  
INSERT 0 0  
master=# insert into test values (1,'abc');  
ERROR:  duplicate key value violates unique constraint "test_pkey"  
DETAIL:  Key (id)=(1) already exists.  
master=# end;  
ROLLBACK  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
  2 | abc  | 2016-02-23 09:17:55.065332  
(2 rows)  

master=# select * from test;  
 id | info   
----+------  
(0 rows)  

Bind variables (prepared statements) are supported:

master=# prepare p1 (int,text,timestamp) as insert into tbl values ($1,$2,$3);  
PREPARE  
master=# prepare p2 (int,int) as select * from tbl where id=$1 and abs(mod($1,4))=$2;  
PREPARE  
master=# prepare p3 (int,int,text,timestamp) as update tbl set info=$3,crt_time=$4 where id=$1 and abs(mod($1,4))=$2;  
PREPARE  
master=# execute p1(1,'abc',now());  
ERROR:  duplicate key value violates unique constraint "pk"  
DETAIL:  Key (id)=(1) already exists.  
CONTEXT:  Remote SQL command: INSERT INTO public.tbl1(id, info, crt_time) VALUES ($1, $2, $3)  
SQL statement "insert into tbl1 (id, info, crt_time) values (NEW.*)"  
PL/pgSQL function f_tbl_ins() line 8 at SQL statement  
master=# execute p1(3,'abc',now());  
INSERT 0 0  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
  2 | abc  | 2016-02-23 09:17:55.065332  
  3 | abc  | 2016-02-23 09:56:00.835324  
(3 rows)  

master=# execute p1(4,'abc',now());  
INSERT 0 0  
master=# execute p1(5,'abc',now());  
INSERT 0 0  
master=# execute p1(6,'abc',now());  
INSERT 0 0  
master=# execute p1(7,'abc',now());  
INSERT 0 0  
master=# execute p1(8,'abc',now());  
INSERT 0 0  
master=# execute p1(9,'abc',now());  
INSERT 0 0  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  4 | abc  | 2016-02-23 09:56:20.159337  
  8 | abc  | 2016-02-23 09:56:31.034317  
  1 | abc  | 2016-02-23 09:17:43.054333  
  5 | abc  | 2016-02-23 09:56:24.392312  
  9 | abc  | 2016-02-23 09:56:33.303365  
  2 | abc  | 2016-02-23 09:17:55.065332  
  6 | abc  | 2016-02-23 09:56:26.560318  
  3 | abc  | 2016-02-23 09:56:00.835324  
  7 | abc  | 2016-02-23 09:56:28.740312  
(9 rows)  

master=# execute p2(1,1);  
 id | info |          crt_time            
----+------+----------------------------  
  1 | abc  | 2016-02-23 09:17:43.054333  
(1 row)  

master=# execute p2(10,2);  
 id | info | crt_time   
----+------+----------  
(0 rows)  

master=# execute p2(1,2);  
 id | info | crt_time   
----+------+----------  
(0 rows)  

master=# execute p2(2,2);  
 id | info |          crt_time            
----+------+----------------------------  
  2 | abc  | 2016-02-23 09:17:55.065332  
(1 row)  

master=# execute p3(1,1,'test',now());  
UPDATE 1  
master=# select * from tbl;  
 id | info |          crt_time            
----+------+----------------------------  
  4 | abc  | 2016-02-23 09:56:20.159337  
  8 | abc  | 2016-02-23 09:56:31.034317  
  5 | abc  | 2016-02-23 09:56:24.392312  
  9 | abc  | 2016-02-23 09:56:33.303365  
  1 | test | 2016-02-23 09:57:12.126359  
  2 | abc  | 2016-02-23 09:17:55.065332  
  6 | abc  | 2016-02-23 09:56:26.560318  
  3 | abc  | 2016-02-23 09:56:00.835324  
  7 | abc  | 2016-02-23 09:56:28.740312  
(9 rows)  

Using xmin and xmax, verify that when some other statement in the transaction, or another node, fails, the foreign-table changes are rolled back too, keeping the global transaction consistent:

db0=# select tableoid,ctid,xmin,xmax,* from tbl0;  
 tableoid |  ctid  |   xmin   | xmax | id | info |          crt_time            
----------+--------+----------+------+----+------+----------------------------  
    20280 | (0,11) | 38232587 |    0 |  4 | test | 2016-02-23 11:25:20.440349  
    20280 | (0,12) | 38232587 |    0 |  8 | test | 2016-02-23 11:25:20.440349  
(2 rows)  

Note that the xmin/xmax of a foreign table as seen from the master node are not accurate; this bug has been reported to the community.

db0=# \c master  
You are now connected to database "master" as user "postgres".  
master=# select tableoid,ctid,xmin,xmax,* from tbl0;  
 tableoid |  ctid  | xmin |    xmax    | id | info |          crt_time            
----------+--------+------+------------+----+------+----------------------------  
    20304 | (0,11) |  192 | 4294967295 |  4 | test | 2016-02-23 11:25:20.440349  
    20304 | (0,12) |  192 | 4294967295 |  8 | test | 2016-02-23 11:25:20.440349  
(2 rows)  

The following statement updates several rows across databases in one go; one of the changes (ID 7 becoming 6) violates the shard's table constraint and fails the statement. Let's see whether global transaction consistency is preserved in this case.

master=# update tbl set id=(case id when 4 then 4 when 3 then 3 when 7 then 6 else id end) ;  
ERROR:  new row for relation "tbl3" violates check constraint "ck1"  
DETAIL:  Failing row contains (6, test, 2016-02-23 11:25:20.440349).  
CONTEXT:  Remote SQL command: UPDATE public.tbl3 SET id = $2 WHERE ctid = $1  

Check the XIDs (as noted above, the xmin/xmax seen through the foreign table on the master are not reliable):

master=# select tableoid,ctid,xmin,xmax,* from tbl0;  
 tableoid |  ctid  | xmin |    xmax    | id | info |          crt_time            
----------+--------+------+------------+----+------+----------------------------  
    20304 | (0,11) |  192 | 4294967295 |  4 | test | 2016-02-23 11:25:20.440349  
    20304 | (0,12) |  192 | 4294967295 |  8 | test | 2016-02-23 11:25:20.440349  
(2 rows)  

Connecting directly to db0, we can see that xmax has changed: the rows were touched and the change was rolled back, preserving global transaction consistency.

master=# \c db0  
You are now connected to database "db0" as user "postgres".  
db0=# select tableoid,ctid,xmin,xmax,* from tbl0;  
 tableoid |  ctid  |   xmin   |   xmax   | id | info |          crt_time            
----------+--------+----------+----------+----+------+----------------------------  
    20280 | (0,11) | 38232587 | 38232588 |  4 | test | 2016-02-23 11:25:20.440349  
    20280 | (0,12) | 38232587 | 38232588 |  8 | test | 2016-02-23 11:25:20.440349  
(2 rows)  

A cross-database JOIN between two foreign tables:

master=# explain select * from tbl, t where t.id=tbl.id and tbl.id=1 and abs(mod(tbl.id,4))=1;  
                                  QUERY PLAN                                     
-------------------------------------------------------------------------------  
 Nested Loop  (cost=0.00..635.41 rows=50 width=88)  
   ->  Append  (cost=0.00..500.68 rows=25 width=44)  
         ->  Seq Scan on t  (cost=0.00..0.00 rows=1 width=44)  
               Filter: (id = 1)  
         ->  Foreign Scan on db0_t  (cost=100.00..125.17 rows=6 width=44)  
         ->  Foreign Scan on db1_t  (cost=100.00..125.17 rows=6 width=44)  
         ->  Foreign Scan on db2_t  (cost=100.00..125.17 rows=6 width=44)  
         ->  Foreign Scan on db3_t  (cost=100.00..125.17 rows=6 width=44)  
   ->  Materialize  (cost=0.00..134.11 rows=2 width=44)  
         ->  Append  (cost=0.00..134.10 rows=2 width=44)  
               ->  Seq Scan on tbl  (cost=0.00..0.00 rows=1 width=44)  
                     Filter: ((id = 1) AND (abs(mod(id, 4)) = 1))  
               ->  Foreign Scan on tbl1  (cost=100.00..134.10 rows=1 width=44)  
(13 rows)  

A JOIN between foreign tables and a global (local) table:

master=# explain select * from tbl, t, test where t.id=tbl.id and test.id=t.id and tbl.id=1 and abs(mod(tbl.id,4))=1;  
                                       QUERY PLAN                                          
-----------------------------------------------------------------------------------------  
 Nested Loop  (cost=0.15..643.60 rows=50 width=124)  
   ->  Append  (cost=0.00..500.68 rows=25 width=44)  
         ->  Seq Scan on t  (cost=0.00..0.00 rows=1 width=44)  
               Filter: (id = 1)  
         ->  Foreign Scan on db0_t  (cost=100.00..125.17 rows=6 width=44)  
         ->  Foreign Scan on db1_t  (cost=100.00..125.17 rows=6 width=44)  
         ->  Foreign Scan on db2_t  (cost=100.00..125.17 rows=6 width=44)  
         ->  Foreign Scan on db3_t  (cost=100.00..125.17 rows=6 width=44)  
   ->  Materialize  (cost=0.15..142.30 rows=2 width=80)  
         ->  Nested Loop  (cost=0.15..142.29 rows=2 width=80)  
               ->  Index Scan using test_pkey on test  (cost=0.15..8.17 rows=1 width=36)  
                     Index Cond: (id = 1)  
               ->  Append  (cost=0.00..134.10 rows=2 width=44)  
                     ->  Seq Scan on tbl  (cost=0.00..0.00 rows=1 width=44)  
                           Filter: ((id = 1) AND (abs(mod(id, 4)) = 1))  
                     ->  Foreign Scan on tbl1  (cost=100.00..134.10 rows=1 width=44)  
(16 rows)  

Scalability and performance:
Both the master and the data shards scale horizontally.
Performance rises roughly linearly with the number of nodes.
The approach suits OLTP better; for OLAP, MPP systems still do a better job.

To push down a JOIN today, wrap the JOIN in a view on the data node and create a foreign table for that view on the upper node; accessing that foreign table pushes the JOIN down (a sketch follows). PostgreSQL will eventually support JOIN pushdown natively, removing the need for this workaround.
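
A minimal sketch of the view workaround (names made up; assumes tables a and b both live on data node db0):

-- on data node db0: wrap the join in a view
create view v_ab as select a.id, a.x, b.y from a join b on (a.id = b.id);

-- on the master: expose the view as a foreign table; scanning it executes the join remotely
create foreign table ft_v_ab (id int, x text, y text) server db0 options (schema_name 'public', table_name 'v_ab');
explain verbose select * from ft_v_ab where id = 1;  -- the Remote SQL runs against the view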


Other notes:
TRUNCATE is not yet supported on foreign tables
JOINs cannot yet be pushed down to the data nodes
Predicates can be pushed down to the data nodes

PostgreSQL sharding for Oracle, SQL Server, DB2, Sybase


Oracle 12c supports sharding, but how can earlier versions be sharded horizontally?
While writing up the PostgreSQL horizontal sharding scheme, an idea came to mind: why not use PostgreSQL's sharding machinery to shard Oracle?
For the sharding architecture and practice, see:
http://blog.163.com/digoal@126/blog/static/16387704020161239252998/

To be fully transparent to Oracle users, a few conditions must be met:
1. PostgreSQL must support Oracle's SQL syntax; EnterpriseDB satisfies this.
2. PostgreSQL must support Oracle's stored procedures, functions and packages; EnterpriseDB satisfies this as well.
If users are willing to rewrite the incompatible SQL and functions, community PostgreSQL meets the sharding need on its own.

The sharding architecture is as follows; it can shard almost any database.
[Architecture diagram]
There are two layers:
1. The upper layer is PostgreSQL, or EnterpriseDB when Oracle syntax compatibility is required. The upper nodes store the table definitions, routing logic, functions, stored procedures, views, sequences and other global data. There may be one or more upper PostgreSQL nodes, at least one.
2. The lower layer consists of the data shard nodes, which can be any database product, e.g. the Oracle, DB2, Sybase and SQL Server shown in the figure. The shard nodes store the data shards and the dimension tables (the user chooses the number of dimension-table replicas).

Note: to support functions, the source database's functions must be converted into PostgreSQL functions; plpgsql can implement them, including emulation of autonomous transactions
(see http://blog.163.com/digoal@126/blog/static/163877040201613982063/ ).
With EnterpriseDB, most Oracle function syntax is compatible and can be used without modification.

Taking Oracle as the example, the implementation steps are:
.1. Install the Oracle data nodes; assume 4 Oracle databases named db0, db1, db2 and db3.
.2. Install one PostgreSQL 9.5+ instance together with the oracle_fdw extension.
The extension lives at http://pgxn.org/dist/oracle_fdw/
and ships with detailed documentation, recommended reading.
http://blog.163.com/digoal@126/blog/static/163877040201181505331588/
After installing, set a correct NLS_LANG environment variable (for example AMERICAN_AMERICA.AL32UTF8) and restart the database.
.3. Configure the Oracle listeners and host firewalls so the PostgreSQL database can reach the Oracle databases.
.4. In the PostgreSQL database, create a foreign server and a user mapping for every data node; this example needs 4 of each.
For example (substitute the correct ip, port, sid, username and password):

master=# create extension oracle_fdw;    
master=# create server db0 foreign data wrapper oracle_fdw OPTIONS (dbserver '//ip:port/sid');    
master=# create server db1 foreign data wrapper oracle_fdw OPTIONS (dbserver '//ip:port/sid');    
master=# create server db2 foreign data wrapper oracle_fdw OPTIONS (dbserver '//ip:port/sid');    
master=# create server db3 foreign data wrapper oracle_fdw OPTIONS (dbserver '//ip:port/sid');    
master=# create user mapping for postgres server db0 options (user 'username', password 'pwd');    
master=# create user mapping for postgres server db1 options (user 'username', password 'pwd');    
master=# create user mapping for postgres server db2 options (user 'username', password 'pwd');    
master=# create user mapping for postgres server db3 options (user 'username', password 'pwd');    

.5. Choose the distribution column for each sharded table; if the distribution column is not an INT, map it to an INT with a hash function (a sketch follows). Rows are distributed by the value of abs(mod(column,4)).
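For instance, a text key can be mapped to a shard number in the PostgreSQL layer with the built-in hashtext() function (a sketch; hashtext is internal and undocumented, and it has no Oracle equivalent, so if the Oracle side must enforce a matching CHECK constraint, a plain INT shard key is the simplest choice):

master=# select abs(mod(hashtext('some-user-name'), 4));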
.6. On every data node db[0-3], create the tables to be sharded, with the CHECK constraint on the distribution column.
For example:

on db0:  
create table tbl ( id int primary key , info varchar2(32), crt_time date, check (abs(mod(id,4))=0));  
on db1:  
create table tbl ( id int primary key , info varchar2(32), crt_time date, check (abs(mod(id,4))=1));  
on db2:  
create table tbl ( id int primary key , info varchar2(32), crt_time date, check (abs(mod(id,4))=2));  
on db3:  
create table tbl ( id int primary key , info varchar2(32), crt_time date, check (abs(mod(id,4))=3));  

.7. Decide the number of replicas for each dimension table; this example assumes 2 replicas, placed on db0 and db1.
.8. Create the dimension tables on data nodes db0 and db1.
For example:

on db0:  
create table test ( id int primary key, info varchar2(32), crt_time date);  
on db1:  
create table test ( id int primary key, info varchar2(32), crt_time date);  

.9. On the PostgreSQL node, create foreign tables for the shard tables. Each must carry the CHECK constraint, and a key column must be declared (OPTIONS (key 'true')), otherwise the table cannot be written:

create FOREIGN table tbl0 (id int OPTIONS (key 'true') , info varchar(32), crt_time timestamp without time zone) server db0 options (table 'tbl', schema 'username');  
create FOREIGN table tbl1 (id int OPTIONS (key 'true') , info varchar(32), crt_time timestamp without time zone) server db1 options (table 'tbl', schema 'username');  
create FOREIGN table tbl2 (id int OPTIONS (key 'true') , info varchar(32), crt_time timestamp without time zone) server db2 options (table 'tbl', schema 'username');  
create FOREIGN table tbl3 (id int OPTIONS (key 'true') , info varchar(32), crt_time timestamp without time zone) server db3 options (table 'tbl', schema 'username');  
alter foreign table tbl0 add constraint ck_tbl0 check (abs(mod(id,4))=0);    
alter foreign table tbl1 add constraint ck_tbl1 check (abs(mod(id,4))=1);    
alter foreign table tbl2 add constraint ck_tbl2 check (abs(mod(id,4))=2);    
alter foreign table tbl3 add constraint ck_tbl3 check (abs(mod(id,4))=3);    

.10. On the PostgreSQL node, create foreign tables for the dimension tables:

 create FOREIGN table test0 (id int OPTIONS (key 'true'), info varchar(32), crt_time timestamp without time zone) server db0 options (table 'test', schema 'username');  
 create FOREIGN table test1 (id int OPTIONS (key 'true'), info varchar(32), crt_time timestamp without time zone) server db1 options (table 'test', schema 'username');  

.11. On the PostgreSQL node, create the parent table for the shards, attach the inheritance relationships, and create the trigger function and trigger:

create table tbl (id int primary key, info varchar(32), crt_time timestamp without time zone);  
alter foreign table tbl0 inherit tbl;    
alter foreign table tbl1 inherit tbl;    
alter foreign table tbl2 inherit tbl;    
alter foreign table tbl3 inherit tbl;    
create or replace function f_tbl_ins() returns trigger as $$    
declare    
begin    
  case abs(mod(NEW.id, 4))     
    when 0 then    
      insert into tbl0 (id, info, crt_time) values (NEW.*);    
    when 1 then    
      insert into tbl1 (id, info, crt_time) values (NEW.*);    
    when 2 then    
      insert into tbl2 (id, info, crt_time) values (NEW.*);    
    when 3 then    
      insert into tbl3 (id, info, crt_time) values (NEW.*);    
    else    
      return null;    
  end case;    
    return null;    
end;    
$$ language plpgsql;    
create trigger tg1 before insert on tbl for each row execute procedure f_tbl_ins();    

.12. On the PostgreSQL node, create the parent table for the dimension tables, attach the inheritance, and create the trigger function and trigger:

create table test (id int primary key, info varchar(32), crt_time timestamp without time zone);  
alter foreign table test0 inherit test;    
-- Each master node can inherit a different replica, spreading reads across them; the PG core does not yet load-balance dimension-table reads by itself.  
create or replace function f_test_iud() returns trigger as $$    
declare    
begin    
  case TG_OP  
    when 'INSERT' then  
      insert into test0 (id, info, crt_time) values (NEW.*);    
      insert into test1 (id, info, crt_time) values (NEW.*);    
    when 'UPDATE' then  
      update test0 set id=NEW.id,info=NEW.info,crt_time=NEW.crt_time where id=OLD.id and info=OLD.info and crt_time=OLD.crt_time;  
      update test1 set id=NEW.id,info=NEW.info,crt_time=NEW.crt_time where id=OLD.id and info=OLD.info and crt_time=OLD.crt_time;  
    when 'DELETE' then  
      delete from test0 where id=OLD.id and info=OLD.info and crt_time=OLD.crt_time;  
      delete from test1 where id=OLD.id and info=OLD.info and crt_time=OLD.crt_time;  
  end case;  
    return null;    
end;    
$$ language plpgsql;    
create trigger tg1 before insert or update or delete on test for each row execute procedure f_test_iud();    

You can now exercise INSERT, SELECT, UPDATE, DELETE, JOIN, and distributed transactions against these tables.
An INSERT into the sharded table tbl computes the modulus of the ID and writes to the matching shard node.
UPDATE, DELETE and SELECT statements that supply the modulus of the ID are routed to the matching child node.
For the dimension table test, SELECT automatically reads test0, while INSERT/UPDATE/DELETE are applied to both test0 and test1 (serially, not in parallel).

When using this approach to shard other databases, note that only EDB's Oracle compatibility is well established; compatibility with anything else must be verified by the user.
Either way, you get:
ACID
distributed transactions
cross-database JOINs
horizontal scaling of both master and data nodes
prepared statements
stored procedures and functions

PostgreSQL optimizer logical inference - a source code walkthrough

We have previously covered the cozy relationship between the optimizer and operators.
Today, let's talk about the optimizer's capacity for logical inference.
A database optimizer needs logical inference, and the stronger the better. Why?
A few examples: from one statement known to be true, infer whether another statement must be true or must be false.
Example 1:
    Given that a > 10 is true,
    we can infer that a < 1 must be false.
Example 2:
    Given that a > 10 is true,
    we cannot infer whether a < 100 must be true or false.
Example 3:
    Given that "a IS NULL" is true,
    we can infer that "a IS NOT NULL" must be false.
Example 4:
    Given that a <> 100 is true,
    we can infer that a = 100 must be false.
Example 5:
    Given that a > 100 is true,
    we can infer that a > 1 must be true.
Example 6:
    Given that "a's coordinates lie within Hangzhou, Zhejiang" is true,
    we can infer that "a's coordinates lie within China" must be true.
Example 7:
    Given that, in the plane, "the distance between point A and (1,100) is less than 100" is true,
    can we infer that "the distance between A and (100,100) is less than 1000" must be true or false?

To summarize the above: we start from one expression whose truth is known, and infer the truth of another. The inference concludes "must be true", "must be false", or "unknown".
Whenever it concludes "must be true" or "must be false", the database can use that to cut later work.
This happens before the optimizer generates the plan tree. For example:
create table tab(id int check (id >=0), info text, crt_time timestamp);
select * from tab where id<0;
The known-true expression is id>=0, from which the optimizer infers that the expression id<0 in the SQL must be false. Executing this SQL, the optimizer can therefore skip scanning the table and filtering on id<0, and instead build a Result node that directly returns 0 rows.
Let's look at the plan:
digoal=# create table ta(id int check (id >=0), info text, crt_time timestamp);
CREATE TABLE
digoal=# explain select * from ta where id=-1;
                     QUERY PLAN                     
----------------------------------------------------
 Seq Scan on ta  (cost=0.00..24.12 rows=6 width=44)
   Filter: (id = '-1'::integer)
(2 rows)
The query above was seemingly not optimized and still scans the table; the reason is that the default value of constraint_exclusion enables this inference only for UNION ALL subqueries and partitioned (inheritance) tables.
Setting constraint_exclusion to on enables the inference for all tables.
digoal=# set constraint_exclusion =on;
SET
digoal=# explain select * from ta where id=-1;  -- the table no longer needs to be scanned
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)
digoal=# explain select * from ta where id<-1;  -- the table no longer needs to be scanned
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)
For predicates that cannot be proven always-false, the table still has to be scanned, e.g. id<>0.
postgres=# explain select * from ta where id<>0;
                      QUERY PLAN                       
-------------------------------------------------------
 Seq Scan on ta  (cost=0.00..24.12 rows=1124 width=44)
   Filter: (id <> 0)
(2 rows)
When the left-hand operand of the supplied expression does not match the one in the known expression, PG's optimizer is not yet that clever: for id+1<10 or id+1<0 it performs no inference at all. I'll point out why in the code below.
postgres=# explain select * from ta where id+1<10;
                      QUERY PLAN                      
------------------------------------------------------
 Seq Scan on ta  (cost=0.00..26.95 rows=377 width=44)
   Filter: ((id + 1) < 10)
(2 rows)
postgres=# explain select * from ta where id+1<0;
                      QUERY PLAN                      
------------------------------------------------------
 Seq Scan on ta  (cost=0.00..26.95 rows=377 width=44)
   Filter: ((id + 1) < 0)
(2 rows)
id+1<0 could be rewritten as id<0-1. For the following expression PG does infer, because the - operator is immutable, so 0-1 folds to the constant -1, enabling the inference.
postgres=# explain select * from ta where id<0-1;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)

So which logical inferences does PostgreSQL support today?
1. The operators in the constraint expression must be B-tree-indexable operators (or IS NULL / OR / IS NOT NULL), i.e. operators a btree index can use for lookups: <, <=, =, >, >= and <> (<> cannot be used by an index directly, but can be rewritten as < OR > to use one).
2. The operators in the expressions of the SQL's WHERE clause must likewise be B-tree-indexable.
3. The operand on the left of the operator in the WHERE clause must exactly match the operand in the constraint.
For example, with the constraint check (mod(id,4) = 0), the WHERE clause must contain an expression of the form mod(id,4) op? ? for inference to happen.
Likewise, with check (id*100 > 1000) the WHERE expression must have the form id*100 op? ?.
With check (id+10 between 1000 and 10000) the WHERE expression must have the form id+10 op? ? (PostgreSQL rewrites BETWEEN ... AND into >= and <=, which are B-tree-indexable).
And with check (id between 1000 and 10000) the WHERE expression must have the form id op? ?.

Examples:
A constraint of the IS [NOT] NULL kind:
postgres=# create table tt1(id int check (id is null));
CREATE TABLE
postgres=# explain select * from tt1 where id=1;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)
postgres=# explain select * from tt1 where id is null;
                     QUERY PLAN                      
-----------------------------------------------------
 Seq Scan on tt1  (cost=0.00..35.50 rows=13 width=4)
   Filter: (id IS NULL)
(2 rows)
postgres=# explain select * from tt1 where id is not null;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)

A constraint mod(id,4) = 0, where = is a B-tree-indexable operator:
postgres=# create table tt2( id int check(mod(id,4) = 0));
CREATE TABLE
postgres=# explain select * from tt2 where id=1;
                     QUERY PLAN                      
-----------------------------------------------------
 Seq Scan on tt2  (cost=0.00..41.88 rows=13 width=4)
   Filter: (id = 1)
(2 rows)
-- For PG to infer anything, the WHERE clause must contain the mod(id,4) expression; since mod is an immutable function, mod(1,4) folds to a constant, so the SQL below is equivalent to
-- explain select * from tt2 where mod(id,4)=1 and id=1; and can therefore be reasoned about.
postgres=# explain select * from tt2 where mod(id,4)=mod(1,4) and id=1;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)

A constraint id*100 > 1000, where > is a B-tree-indexable operator:
postgres=# create table tt3( id int check(id*100 > 1000));
CREATE TABLE
postgres=# explain select * from tt3 where id=1;
                     QUERY PLAN                      
-----------------------------------------------------
 Seq Scan on tt3  (cost=0.00..41.88 rows=13 width=4)
   Filter: (id = 1)
(2 rows)
-- For PG to infer anything, the WHERE clause must contain the id*100 expression; * is an immutable operator, so 1*100 folds to a constant, enabling the inference.
postgres=# explain select * from tt3 where id=1 and id*100=1*100;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)

A constraint id+10 between 1000 and 10000; BETWEEN ... AND is rewritten into >= and <=, which are B-tree-indexable, and the WHERE clause must contain the id+10 expression:
postgres=# create table tt4( id int check(id+10 between 1000 and 10000));
CREATE TABLE
postgres=# explain select * from tt4 where id=1;
                     QUERY PLAN                      
-----------------------------------------------------
 Seq Scan on tt4  (cost=0.00..41.88 rows=13 width=4)
   Filter: (id = 1)
(2 rows)
postgres=# explain select * from tt4 where id=1 and id+10=1+10;  -- + is immutable, so 1+10 folds to the constant 11
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)

A constraint check (id between 1000 and 10000):
postgres=# create table tt5( id int check(id between 1000 and 10000));
CREATE TABLE
postgres=# explain select * from tt5 where id=1;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)
postgres=# explain select * from tt5 where id+1=1;
                     QUERY PLAN                      
-----------------------------------------------------
 Seq Scan on tt5  (cost=0.00..48.25 rows=13 width=4)
   Filter: ((id + 1) = 1)
(2 rows)
postgres=# explain select * from tt5 where 1=id;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)
postgres=# explain select * from tt5 where 1>id;
                QUERY PLAN                
------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0)
   One-Time Filter: false
(2 rows)
postgres=# explain select * from tt5 where 1<id;
                      QUERY PLAN                      
------------------------------------------------------
 Seq Scan on tt5  (cost=0.00..41.88 rows=850 width=4)
   Filter: (1 < id)
(2 rows)

How does PostgreSQL implement this inference?
In all the examples above, the predicates reduce to ?1 op ?2, where ?1 is an expression or column and ?2 is a constant.
But how can the database decide one condition's truth from another condition's truth?
Back to an example:
check id > 100
Is id > 1 true or false? That can be decided by comparing two constants: if 100 >= 1 is true, then id > 1 is true.
Why compare these two constants? Because this is the optimizer's means of ruling out a table scan, at a point where no id values have been read yet; constants are all it has to reason with.
The code works as follows.
PG currently implements inference only for operators usable by btree indexes, encoded in two lookup tables:
BT_implic_table proves "must be true"; BT_refute_table proves "must be false".
For example:
Given that ATTR given_op CONST1 is true,
if CONST2 test_op CONST1 is true,
then ATTR target_op CONST2 must also be true,
where test_op = BT_implic_table[given_op-1][target_op-1] is the operator fetched from the BT_implic_table mapping.

Given that ATTR given_op CONST1 is true,
if CONST2 test_op CONST1 is true,
then ATTR target_op CONST2 must be false,
where test_op = BT_refute_table[given_op-1][target_op-1] is the operator fetched from the BT_refute_table mapping.

/*
 * Define an "operator implication table" for btree operators ("strategies"),
 * and a similar table for refutation.
 *
 * The strategy numbers defined by btree indexes (see access/skey.h) are:
 * (1) < (2) <= (3) = (4) >=   (5) >
 * and in addition we use (6) to represent <>.  <> is not a btree-indexable
 * operator, but we assume here that if an equality operator of a btree
 * opfamily has a negator operator, the negator behaves as <> for the opfamily.
 * (This convention is also known to get_op_btree_interpretation().)
 *
 * The interpretation of:
 *
 * test_op = BT_implic_table[given_op-1][target_op-1]
 *
 * where test_op, given_op and target_op are strategy numbers (from 1 to 6)
 * of btree operators, is as follows:
 *
 * If you know, for some ATTR, that "ATTR given_op CONST1" is true, and you
 * want to determine whether "ATTR target_op CONST2" must also be true, then
 * you can use "CONST2 test_op CONST1" as a test.  If this test returns true,
 * then the target expression must be true; if the test returns false, then
 * the target expression may be false.
 *
 * For example, if clause is "Quantity > 10" and pred is "Quantity > 5"
 * then we test "5 <= 10" which evals to true, so clause implies pred.
 *
 * Similarly, the interpretation of a BT_refute_table entry is:
 *
 * If you know, for some ATTR, that "ATTR given_op CONST1" is true, and you
 * want to determine whether "ATTR target_op CONST2" must be false, then
 * you can use "CONST2 test_op CONST1" as a test.  If this test returns true,
 * then the target expression must be false; if the test returns false, then
 * the target expression may be true.
 *
 * For example, if clause is "Quantity > 10" and pred is "Quantity < 5"
 * then we test "5 <= 10" which evals to true, so clause refutes pred.
 *
 * An entry where test_op == 0 means the implication cannot be determined.
 */

#define BTLT BTLessStrategyNumber
#define BTLE BTLessEqualStrategyNumber
#define BTEQ BTEqualStrategyNumber
#define BTGE BTGreaterEqualStrategyNumber
#define BTGT BTGreaterStrategyNumber
#define BTNE ROWCOMPARE_NE

static const StrategyNumber BT_implic_table[6][6] = {
/*
 * The target operator:
 *
 * LT    LE EQ    GE GT    NE
 */
{BTGE, BTGE, 0, 0, 0, BTGE}, /* LT */
{BTGT, BTGE, 0, 0, 0, BTGT}, /* LE */
{BTGT, BTGE, BTEQ, BTLE, BTLT, BTNE}, /* EQ */
{0, 0, 0, BTLE, BTLT, BTLT}, /* GE */
{0, 0, 0, BTLE, BTLE, BTLE}, /* GT */
{0, 0, 0, 0, 0, BTEQ} /* NE */
};

static const StrategyNumber BT_refute_table[6][6] = {
/*
 * The target operator:
 *
 * LT    LE EQ    GE GT    NE
 */
{0, 0, BTGE, BTGE, BTGE, 0}, /* LT */
{0, 0, BTGT, BTGT, BTGE, 0}, /* LE */
{BTLE, BTLT, BTNE, BTGT, BTGE, BTEQ}, /* EQ */
{BTLE, BTLT, BTLT, 0, 0, 0}, /* GE */
{BTLE, BTLE, BTLE, 0, 0, 0}, /* GT */
{0, 0, BTEQ, 0, 0, 0} /* NE */
};
A 0 in these tables means the implication cannot be determined. For example, from a > 100 you can never prove that a > ? must be false; you can only ever prove that a > ? must be true.
The inference works from 100, ?, and the test_op fetched from BT_implic_table, here BTLE i.e. <=: if ? <= 100 is true, then a > ? must be true.
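
BT_implic_table can be seen at work with a partial index: the planner must prove that the query's predicate implies the index's predicate before it may use the index (a sketch; on a small table you may need enable_seqscan=off to make the index path visible):

postgres=# create table q(a int);
postgres=# create index q_part_idx on q(a) where a > 1;
postgres=# set enable_seqscan = off;
postgres=# explain select * from q where a > 100;  -- the partial index is usable only because a > 100 implies a > 1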

PostgreSQL obtains test_op via get_btree_test_op; the relevant part of that function:
/*
 * Look up the "test" strategy number in the implication table
 */
if (refute_it)
    test_strategy = BT_refute_table[clause_strategy - 1][pred_strategy - 1];
else
    test_strategy = BT_implic_table[clause_strategy - 1][pred_strategy - 1];

if (test_strategy == 0)
{
    /* Can't determine implication using this interpretation */
    continue;
}

/*
 * See if opfamily has an operator for the test strategy and the
 * datatypes.
 */
if (test_strategy == BTNE)
{
    test_op = get_opfamily_member(opfamily_id,
                                  pred_op_info->oprighttype,
                                  clause_op_info->oprighttype,
                                  BTEqualStrategyNumber);
    if (OidIsValid(test_op))
        test_op = get_negator(test_op);
}
else
{
    test_op = get_opfamily_member(opfamily_id,
                                  pred_op_info->oprighttype,
                                  clause_op_info->oprighttype,
                                  test_strategy);
}

if (!OidIsValid(test_op))
    continue;
...
return test_op;

So what does PostgreSQL use this inference for?
Proving "must be false" lets it exclude tables from the plan entirely.
Proving "must be true" is used while building the plan (for example, to match partial indexes, as sketched above).
Taking "must be false" as the example, let's see how the optimizer excludes tables that need not take part in the plan.
This application of inference is gated by the constraint_exclusion parameter, and the entry point is relation_excluded_by_constraints: returning true means the table does not need to be scanned; false means it does.
A quick walk through the function's code follows.
With constraint_exclusion disabled (for plain tables), no inference is attempted:
/* Skip the test if constraint exclusion is disabled for the rel */
if (constraint_exclusion == CONSTRAINT_EXCLUSION_OFF ||
    (constraint_exclusion == CONSTRAINT_EXCLUSION_PARTITION &&
     !(rel->reloptkind == RELOPT_OTHER_MEMBER_REL ||
       (root->hasInheritedTarget &&
        rel->reloptkind == RELOPT_BASEREL &&
        rel->relid == root->parse->resultRelation))))
    return false;

Before testing the table's own constraints against the WHERE clause, the function first checks whether the WHERE conditions are self-contradictory, e.g.:
 id <> mod(4,3) and id = mod(4,3)
postgres=# \d+ tt11
                         Table "public.tt11"
 Column |  Type   | Modifiers | Storage | Stats target | Description 
--------+---------+-----------+---------+--------------+-------------
 id     | integer |           | plain   |              | 

postgres=# explain (analyze,verbose) select * from tt11 where id<>mod(4,3) and id=mod(4,3);
                                     QUERY PLAN                                     
------------------------------------------------------------------------------------
 Result  (cost=0.00..0.01 rows=1 width=0) (actual time=0.001..0.001 rows=0 loops=1)
   Output: id
   One-Time Filter: false
 Planning time: 0.051 ms
 Execution time: 0.012 ms
(5 rows)

The code:
/*
 * Check for self-contradictory restriction clauses.  We dare not make
 * deductions with non-immutable functions, but any immutable clauses that
 * are self-contradictory allow us to conclude the scan is unnecessary.
 *
 * Note: strip off RestrictInfo because predicate_refuted_by() isn't
 * expecting to see any in its predicate argument.
 */
safe_restrictions = NIL;
foreach(lc, rel->baserestrictinfo)
{
    RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);

    if (!contain_mutable_functions((Node *) rinfo->clause))
        safe_restrictions = lappend(safe_restrictions, rinfo->clause);
}

if (predicate_refuted_by(safe_restrictions, safe_restrictions))
    return true;

Next, fetch the constraints of the table referenced by the SQL (inheritance parents are skipped; each child relation is examined separately):
/* Only plain relations have constraints */
if (rte->rtekind != RTE_RELATION || rte->inh)
    return false;

/*
 * OK to fetch the constraint expressions.  Include "col IS NOT NULL"
 * expressions for attnotnull columns, in case we can refute those.
 */
constraint_pred = get_relation_constraints(root, rte->relid, rel, true);

/*
 * We do not currently enforce that CHECK constraints contain only
 * immutable functions, so it's necessary to check here. We daren't draw
 * conclusions from plan-time evaluation of non-immutable functions. Since
 * they're ANDed, we can just ignore any mutable constraints in the list,
 * and reason about the rest.
 */
safe_constraints = NIL;
foreach(lc, constraint_pred)
{
    Node   *pred = (Node *) lfirst(lc);

    /* expressions containing non-immutable functions are left out of the
     * proof: their values can vary and cannot be folded to constants */
    if (!contain_mutable_functions(pred))
        safe_constraints = lappend(safe_constraints, pred);
}

/*
 * The constraints are effectively ANDed together, so we can just try to
 * refute the entire collection at once.  This may allow us to make proofs
 * that would fail if we took them individually.
 *
 * Note: we use rel->baserestrictinfo, not safe_restrictions as might seem
 * an obvious optimization.  Some of the clauses might be OR clauses that
 * have volatile and nonvolatile subclauses, and it's OK to make
 * deductions with the nonvolatile parts.
 */
/* test whether the restrictions must be false; if so, skip scanning this table */
if (predicate_refuted_by(safe_constraints, rel->baserestrictinfo))
    return true;
The call stack:
predicate_refuted_by
predicate_refuted_by_recurse
predicate_refuted_by_simple_clause
       return btree_predicate_proof(predicate, clause, true)
btree_predicate_proof@src/backend/optimizer/util/predtest.c
/*
 * Lookup the comparison operator using the system catalogs and the
 * operator implication tables.
 */
test_op = get_btree_test_op(pred_op, clause_op, refute_it);

How could the optimizer's inference power be extended?
Today PostgreSQL supports logical inference only for a limited set of operators, which must be btree-indexable:
postgres=# select oprname,oprcode from pg_operator where oid in (select amopopr from pg_amop where amopmethod=(select oid from pg_am where amname='btree'));
 oprname |         oprcode          
---------+--------------------------
 =       | int48eq
 <       | int48lt
 >       | int48gt
 <=      | int48le
 >=      | int48ge
 <       | boollt
 >       | boolgt
 =       | booleq
 <=      | boolle
 >=      | boolge
 =       | chareq
 =       | nameeq
 =       | int2eq
 <       | int2lt
 =       | int4eq
 <       | int4lt
 =       | texteq
 =       | tideq
 <       | tidlt
 >       | tidgt
 <=      | tidle
 >=      | tidge
 =       | int8eq
 <       | int8lt
 >       | int8gt
 <=      | int8le
 >=      | int8ge
 =       | int84eq
 <       | int84lt
 >       | int84gt
 <=      | int84le
 >=      | int84ge
 >       | int2gt
 >       | int4gt
 <=      | int2le
 <=      | int4le
 >=      | int2ge
 >=      | int4ge
 =       | int24eq
 =       | int42eq
 <       | int24lt
 <       | int42lt
 >       | int24gt
 >       | int42gt
 <=      | int24le
 <=      | int42le
 >=      | int24ge
 >=      | int42ge
 =       | abstimeeq
 <       | abstimelt
 >       | abstimegt
 <=      | abstimele
 >=      | abstimege
 =       | reltimeeq
 <       | reltimelt
 >       | reltimegt
 <=      | reltimele
 >=      | reltimege
 =       | oideq
 <       | oidlt
 >       | oidgt
 <=      | oidle
 >=      | oidge
 <       | oidvectorlt
 >       | oidvectorgt
 <=      | oidvectorle
 >=      | oidvectorge
 =       | oidvectoreq
 =       | float4eq
 <       | float4lt
 >       | float4gt
 <=      | float4le
 >=      | float4ge
 <       | charlt
 <=      | charle
 >       | chargt
 >=      | charge
 <       | namelt
 <=      | namele
 >       | namegt
 >=      | namege
 <       | text_lt
 <=      | text_le
 >       | text_gt
 >=      | text_ge
 =       | float8eq
 <       | float8lt
 <=      | float8le
 >       | float8gt
 >=      | float8ge
 =       | tintervaleq
 <       | tintervallt
 >       | tintervalgt
 <=      | tintervalle
 >=      | tintervalge
 =       | cash_eq
 <       | cash_lt
 >       | cash_gt
 <=      | cash_le
 >=      | cash_ge
 =       | bpchareq
 <       | bpcharlt
 <=      | bpcharle
 >       | bpchargt
 >=      | bpcharge
 =       | array_eq
 <       | array_lt
 >       | array_gt
 <=      | array_le
 >=      | array_ge
 =       | date_eq
 <       | date_lt
 <=      | date_le
 >       | date_gt
 >=      | date_ge
 =       | time_eq
 <       | time_lt
 <=      | time_le
 >       | time_gt
 >=      | time_ge
 =       | timetz_eq
 <       | timetz_lt
 <=      | timetz_le
 >       | timetz_gt
 >=      | timetz_ge
 =       | float48eq
 <       | float48lt
 >       | float48gt
 <=      | float48le
 >=      | float48ge
 =       | float84eq
 <       | float84lt
 >       | float84gt
 <=      | float84le
 >=      | float84ge
 =       | timestamptz_eq
 <       | timestamptz_lt
 <=      | timestamptz_le
 >       | timestamptz_gt
 >=      | timestamptz_ge
 =       | interval_eq
 <       | interval_lt
 <=      | interval_le
 >       | interval_gt
 >=      | interval_ge
 =       | macaddr_eq
 <       | macaddr_lt
 <=      | macaddr_le
 >       | macaddr_gt
 >=      | macaddr_ge
 =       | network_eq
 <       | network_lt
 <=      | network_le
 >       | network_gt
 >=      | network_ge
 =       | numeric_eq
 <       | numeric_lt
 <=      | numeric_le
 >       | numeric_gt
 >=      | numeric_ge
 =       | biteq
 <       | bitlt
 >       | bitgt
 <=      | bitle
 >=      | bitge
 =       | varbiteq
 <       | varbitlt
 >       | varbitgt
 <=      | varbitle
 >=      | varbitge
 =       | int28eq
 <       | int28lt
 >       | int28gt
 <=      | int28le
 >=      | int28ge
 =       | int82eq
 <       | int82lt
 >       | int82gt
 <=      | int82le
 >=      | int82ge
 =       | byteaeq
 <       | bytealt
 <=      | byteale
 >       | byteagt
 >=      | byteage
 =       | timestamp_eq
 <       | timestamp_lt
 <=      | timestamp_le
 >       | timestamp_gt
 >=      | timestamp_ge
 ~<~     | text_pattern_lt
 ~<=~    | text_pattern_le
 ~>=~    | text_pattern_ge
 ~>~     | text_pattern_gt
 ~<~     | bpchar_pattern_lt
 ~<=~    | bpchar_pattern_le
 ~>=~    | bpchar_pattern_ge
 ~>~     | bpchar_pattern_gt
 <       | date_lt_timestamp
 <=      | date_le_timestamp
 =       | date_eq_timestamp
 >=      | date_ge_timestamp
 >       | date_gt_timestamp
 <       | date_lt_timestamptz
 <=      | date_le_timestamptz
 =       | date_eq_timestamptz
 >=      | date_ge_timestamptz
 >       | date_gt_timestamptz
 <       | timestamp_lt_date
 <=      | timestamp_le_date
 =       | timestamp_eq_date
 >=      | timestamp_ge_date
 >       | timestamp_gt_date
 <       | timestamptz_lt_date
 <=      | timestamptz_le_date
 =       | timestamptz_eq_date
 >=      | timestamptz_ge_date
 >       | timestamptz_gt_date
 <       | timestamp_lt_timestamptz
 <=      | timestamp_le_timestamptz
 =       | timestamp_eq_timestamptz
 >=      | timestamp_ge_timestamptz
 >       | timestamp_gt_timestamptz
 <       | timestamptz_lt_timestamp
 <=      | timestamptz_le_timestamp
 =       | timestamptz_eq_timestamp
 >=      | timestamptz_ge_timestamp
 >       | timestamptz_gt_timestamp
 =       | uuid_eq
 <       | uuid_lt
 >       | uuid_gt
 <=      | uuid_le
 >=      | uuid_ge
 =       | pg_lsn_eq
 <       | pg_lsn_lt
 >       | pg_lsn_gt
 <=      | pg_lsn_le
 >=      | pg_lsn_ge
 =       | enum_eq
 <       | enum_lt
 >       | enum_gt
 <=      | enum_le
 >=      | enum_ge
 <       | tsvector_lt
 <=      | tsvector_le
 =       | tsvector_eq
 >=      | tsvector_ge
 >       | tsvector_gt
 <       | tsquery_lt
 <=      | tsquery_le
 =       | tsquery_eq
 >=      | tsquery_ge
 >       | tsquery_gt
 =       | record_eq
 <       | record_lt
 >       | record_gt
 <=      | record_le
 >=      | record_ge
 *=      | record_image_eq
 *<      | record_image_lt
 *>      | record_image_gt
 *<=     | record_image_le
 *>=     | record_image_ge
 =       | range_eq
 <       | range_lt
 <=      | range_le
 >=      | range_ge
 >       | range_gt
 =       | jsonb_eq
 <       | jsonb_lt
 >       | jsonb_gt
 <=      | jsonb_le
 >=      | jsonb_ge
(273 rows)
Operators outside this set take no part in inference.
For example, knowing that geo lies strictly to the left of point (10,0), one can certainly infer that it cannot lie to the right of (11,0), so under normal circumstances the table scan could be excluded.
But because << and >> are not btree operators, no inference is attempted:
postgres=# create table tt13(id int, geo point check(geo << point '(10,0)'));
CREATE TABLE
postgres=# explain select * from tt13 where geo >> point '(11,0)';
                       QUERY PLAN                       
--------------------------------------------------------
 Seq Scan on tt13  (cost=0.00..31.25 rows=170 width=20)
   Filter: (geo >> '(11,0)'::point)
(2 rows)

This kind of inference is especially prominent with partitioned tables. For example, suppose a user lays out a set of partitions split by ID modulus:
postgres=# create table p(id int, info text);
CREATE TABLE
postgres=# create table t0(id int check(abs(mod(id,4))=0), info text);
CREATE TABLE
postgres=# create table t1(id int check(abs(mod(id,4))=1), info text);
CREATE TABLE
postgres=# create table t2(id int check(abs(mod(id,4))=2), info text);
CREATE TABLE
postgres=# create table t3(id int check(abs(mod(id,4))=3), info text);
CREATE TABLE
postgres=# alter table t0 inherit p;
ALTER TABLE
postgres=# alter table t1 inherit p;
ALTER TABLE
postgres=# alter table t2 inherit p;
ALTER TABLE
postgres=# alter table t3 inherit p;
ALTER TABLE
postgres=# explain select * from p where id=0;  -- the operands of id=0 and abs(mod(id,4))=0..3 differ, so no inference happens
                        QUERY PLAN                        
----------------------------------------------------------
 Append  (cost=0.00..103.50 rows=25 width=36)
   ->  Seq Scan on p  (cost=0.00..0.00 rows=1 width=36)
         Filter: (id = 0)
   ->  Seq Scan on t0  (cost=0.00..25.88 rows=6 width=36)
         Filter: (id = 0)
   ->  Seq Scan on t1  (cost=0.00..25.88 rows=6 width=36)
         Filter: (id = 0)
   ->  Seq Scan on t2  (cost=0.00..25.88 rows=6 width=36)
         Filter: (id = 0)
   ->  Seq Scan on t3  (cost=0.00..25.88 rows=6 width=36)
         Filter: (id = 0)
(11 rows)
postgres=# explain select * from p where id=0 and abs(mod(id,4)) = abs(mod(0,4));   -- so the query must carry an operand matching the constraint
                        QUERY PLAN                        
----------------------------------------------------------
 Append  (cost=0.00..35.40 rows=2 width=36)
   ->  Seq Scan on p  (cost=0.00..0.00 rows=1 width=36)
         Filter: ((id = 0) AND (abs(mod(id, 4)) = 0))
   ->  Seq Scan on t0  (cost=0.00..35.40 rows=1 width=36)
         Filter: ((id = 0) AND (abs(mod(id, 4)) = 0))
(5 rows)
With range partitioning this problem does not arise, because the operands can be made to match; a sketch follows.
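A sketch with range partitions, where the bare column sits on the left-hand side of the constraint, so a plain id = ? predicate is enough for exclusion:

postgres=# create table rp(id int, info text);
postgres=# create table rp0(check (id >= 0 and id < 100)) inherits (rp);
postgres=# create table rp1(check (id >= 100 and id < 200)) inherits (rp);
postgres=# explain select * from rp where id = 150;  -- with the default constraint_exclusion=partition, only rp and rp1 are scanned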

As these examples show, PostgreSQL's optimizer still has room to grow its inference power.
Anything that can be proven always-false can be used by the optimizer to exclude tables.

References
1. When choosing a partitioning scheme, the functions or operators involved must be immutable, and it is best to partition on a bare column with btree operators, which keeps the SQL simple. Failing that, every SQL statement must carry the constraint expression verbatim with the constant substituted in, e.g. abs(mod(id,4)) = abs(mod(?,4)).
2. The following caveats apply to constraint exclusion (from the PostgreSQL manual):

Constraint exclusion only works when the query's WHERE clause contains constants (or externally supplied parameters). For example, a comparison against a non-immutable function such as CURRENT_TIMESTAMP cannot be optimized, since the planner cannot know which partition the function value might fall into at run time.

Keep the partitioning constraints simple, else the planner may not be able to prove that partitions don't need to be visited. Use simple equality conditions for list partitioning, or simple range tests for range partitioning, as illustrated in the preceding examples. A good rule of thumb is that partitioning constraints should contain only comparisons of the partitioning column(s) to constants using B-tree-indexable operators.

All constraints on all partitions of the master table are examined during constraint exclusion, so large numbers of partitions are likely to increase query planning time considerably. Partitioning using these techniques will work well with up to perhaps a hundred partitions; don't try to use many thousands of partitions.
3. constraint_exclusion (enum)
Controls the query planner's use of table constraints to optimize queries. The allowed values of constraint_exclusion are on (examine constraints for all tables), off (never examine constraints), and partition (examine constraints only for inheritance child tables and UNION ALL subqueries). partition is the default setting. It is often used with inheritance and partitioned tables to improve performance.


Using TPC-H

[Reposted from]
http://blog.csdn.net/leixingbang1989/article/details/8766047

Introduction:

Back in the lab my advisor needed TPC know-how and asked me to research TPC-H. Searching Baidu and other sources turned up very little domestic research, and what exists is vague, while the English TPC-H specification runs to over two hundred pages and barely explains how to use the crucial QGEN and DBGEN tools.

So I wrote this article, hoping to help people starting out and to spare fellow TPC researchers some detours.

This tutorial aims to help the reader:

(1) understand where TPC-H comes from and what it is;

(2) learn to use dbgen and qgen: generate a data set of a chosen scale on Windows with SQL Server 2005, import it with scripts, and set up all the table relationships;

(3) find a Chinese translation of the TPC-H documentation and links to recommended papers.

About TPC

The Transaction Processing Performance Council (TPC) is a non-profit organization founded by dozens of member companies, headquartered in the United States. Membership is open worldwide, but so far most members are large American, Japanese and Western European companies. TPC members are mainly hardware and software vendors rather than end users; TPC's role is to define standard benchmark specifications with performance and price metrics, and to manage the publication of test results.

TPC-C tests a database system's transaction-processing capability; TPC-App tests B2B application services and web services in a 24x7 environment. TPC also published TPC-S (a server-specific benchmark), TPC-E (a benchmark for large-enterprise information services) and TPC-Client/Server, but these three were not accepted by the industry and were abandoned.

TPC does not publish benchmark code, only the Standard Specification. Any vendor or tester can build their own optimal system (test platform and test programs) according to the specification; you write your own test harness and submit the results to TPC. To keep results objective, the tested party (usually the vendor) must submit a Full Disclosure Report to TPC, including the detailed configuration of the system under test, itemized prices, and the total price including five years of maintenance. The report must be verified by a TPC-authorized auditor (TPC itself does not audit); there are fewer than ten auditors worldwide, all in the US, which is one reason testing is expensive.

TPC's benchmarks to date

TPC has released 11 benchmarks: the current TPC-App, TPC-H, TPC-C and TPC-W; the obsolete TPC-A, TPC-B, TPC-D and TPC-R; and the abandoned TPC-S (server-specific benchmark), TPC-E (large-enterprise information services benchmark) and TPC-Client/Server, which the industry never accepted.

This article focuses on TPC-H.

The purpose of TPC-H

TPC-H evaluates decision-support capability for a given set of queries, stressing a server's ability in data mining and analytical processing. Queries are the core of decision-support workloads; complex queries in a data warehouse fall into two types: queries known in advance, such as periodic business reports, and queries not known in advance, called ad-hoc queries.

Put simply, when a database vendor builds a new database system, TPC-H serves as a benchmark to measure its decision-support query capability.

The TPC-H metric

TPC-H models database operations in a decision-support system, measures the response time of complex queries, and reports queries executed per hour (QphH@Size) as the metric.

The TPC-H specification

The TPC-H specification consists of 10 chapters plus 5 appendices; for details see tpch2.14.4.docx.

Operating conditions for the database

The TPC-H model assumes a database server running 24x7 with perhaps one maintenance window per month; multiple users concurrently run complex ad-hoc queries while table-modification operations execute concurrently. The schema (Figure 1) has 8 tables; except for Nation and Region, table sizes depend on the tested data volume, i.e. the scale factor SF.

The schema diagram and column definitions are shown in the figure below; for the exact column definitions consult the TPC-H specification.

 

 

[Figure: TPC-H database model]

 

Data volume rules

Since data volume directly affects query speed, the TPC-H standard strictly and explicitly defines the amount of data in the system. SF describes the volume, 1 SF corresponding to 1 GB; the allowed values, from low to high, are 1, 10, 30, 100, 300, 1000, 3000 and 10000. Note that the SF volume covers only the 8 base tables, not indexes or temporary tables.

Over a full TPC-H run the storage needed is considerably larger, comprising base tables, indexes, temporary space, the flat data files and backups. If the base tables take x, the rule of thumb for indexes plus temporary space is 3-5x (take the upper bound, 5x); the flat files produced by DBGEN take x; backups take x; in total about 8x. So at SF=1 you should provision 8 GB, eight times the base data, to run the test smoothly.

The 22 queries

The TPC-H test revolves around 22 SELECT statements. Each is strictly defined, follows SQL-92 syntax, and may not be modified by the user. The standard defines each SELECT from four angles: the business question, the SELECT syntax, the parameters, and query validation. These SELECTs are more complex than most real OLTP workloads; a single SELECT can run from tens of seconds to over 15 hours, so one pass over the 22 queries takes several hours.

The 2 refresh operations

To model a real data-warehouse environment, a pair of update operations, RF1 and RF2, run concurrently with the 22 queries. RF1 inserts new rows equal to 0.1% of the original row count into the Orders and Lineitem tables, modeling the arrival of new sales data; RF2 deletes the same amount from Orders and Lineitem, modeling the retirement of old sales data. RF1 and RF2 must preserve the database's ACID properties and leave the total data volume unchanged. Apart from success or failure messages, they produce no other output.

The 3 tests

TPC-H decomposes into three sub-tests: the load test, the Power test and the Throughput test. Building the test database is called loading, and the load test measures the DBMS's ability to load data; it runs first and is very time-consuming. The Power test runs right after the load, with the database in its initial state and nothing else executed, so in particular the buffers are not yet warmed with test data; these are called raw queries. The Power test runs the 22 queries once, in order, alongside one RF1/RF2 pair. Finally comes the Throughput test, the core and most complex part; it is closest to a real environment and puts far more pressure on the SUT than the Power test, running multiple query streams concurrently with one RF1/RF2 update stream.

Metrics

All base measurements are times: the time of each load step, each query, and each update. From these one computes the load time, Power@Size, Throughput@Size, QphH@Size and $/QphH@Size.

Load time

The load has timed and untimed phases; timed phases must be measured and counted toward the load time. Typically the timed phases are table creation, data insertion and index builds.

Query and update times

In the Power and Throughput tests, every query and update stream must be timed. A query's time runs from the first character of the submitted query to the last character of the returned result. RF1 and RF2 are timed separately, from submission to completion.

Power@Size

Power@Size is the result of the Power test, defined as the inverse of the geometric mean of the query and refresh times; the formula is reconstructed below.
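The formula image did not survive the repost; per the TPC-H specification it is:

\text{Power@Size} = \frac{3600 \times SF}{\sqrt[24]{\prod_{i=1}^{22} QI(i,0) \times \prod_{j=1}^{2} RI(j,0)}}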

 

where Size is the data scale; SF the scale factor; QI(i,0) the elapsed time of query i, in seconds; and RI(j,0) the elapsed time of refresh function RFj, in seconds.

Throughput@Size

Throughput@Size is the result of the Throughput test, defined as the inverse of the average execution time of all queries, normalized as shown below.
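The formula image is missing here as well; per the TPC-H specification it is:

\text{Throughput@Size} = \frac{S \times 22 \times 3600}{T_s} \times SF

where S is the number of query streams and T_s is the measured interval of the throughput test, in seconds.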

 

 

 


The DBGEN and QGEN programs

Appendix D of the TPC-H specification contains two ANSI C source packages, DBGEN and QGEN. DBGEN generates the test data, controlled through command-line parameters. QGEN generates the 22 SELECT statements plus the two refresh operations, RF1 and RF2.

Detailed usage guide:

    http://www.geniiius.com/blog/generate-test-data-using-dbgen

 

 

The following walks through, step by step, how to use the tool to generate a database with tables big enough to demonstrate performance optimization.

1. Download DBGEN

Download DBGEN from http://www.tpc.org/tpch/spec/tpch_2_14_3.zip and extract it. In this guide the zip file is extracted to C:\tpch_2_14_3\.

This is the project we need to build to get the executable.

2. Build the solution

Open C:\tpch_2_14_3\dbgen\tpch.sln in Visual Studio. Depending on your Visual Studio version you may be greeted by a conversion wizard; just click Finish to run the conversion. All you then need to do is build the full solution. I hit errors from some locked files, so I had to manually delete everything in C:\tpch_2_14_3\dbgen\Debug before the solution would compile. The result is the file C:\tpch_2_14_3\dbgen\Debug\dbgen.exe.

3. Generate data with dbgen.exe

Now run dbgen.exe. Running it with -h prints some help.

If we simply run dbgen.exe, by default it generates 1 GB of data, split across the 8 tables (customer, nation, orders, lineitem, part, partsupp, region, supplier). The -s parameter specifies a scale factor, so -s 10 gives us 10 GB and -s 100 generates 100 GB. Let's try the default.

Oops, an error! I don't know why, but the fix is simple: copy dbgen.exe up one level so it sits in the C:\tpch_2_14_3\dbgen folder, then try again.

-v gives verbose output. It now generates one file per table; depending on your system's speed this may take a few minutes.

The generated files are placed in the same directory as dbgen.exe. The list of generated files is:


After generating the .tbl files you need to create the tables, import the data, and set up the relationships between tables.

Below are the SQL statements to create and wire up the 8 tables (SQL Server 2005 only; for other databases consult that product's SQL dialect).

Usage:

1) Create a database (any name you like).

2) In that database, open a new query window, paste the statements below, and execute; this builds the schema (table structures plus primary/foreign key relationships).

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[REGION]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[REGION](
    [R_REGIONKEY] [int] NOT NULL,
    [R_NAME] [char](25) NOT NULL,
    [R_COMMENT] [varchar](152) NULL,
 CONSTRAINT [PK_REGION] PRIMARY KEY CLUSTERED
(
    [R_REGIONKEY] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
END
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[PART]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[PART](
    [P_PARTKEY] [int] NOT NULL,
    [P_NAME] [varchar](55) NOT NULL,
    [P_MFGR] [char](25) NOT NULL,
    [P_BRAND] [char](10) NOT NULL,
    [P_TYPE] [varchar](25) NOT NULL,
    [P_SIZE] [int] NOT NULL,
    [P_CONTAINER] [char](10) NOT NULL,
    [P_RETAILPRICE] [decimal](15, 2) NOT NULL,
    [P_COMMENT] [varchar](23) NOT NULL,
 CONSTRAINT [PK_PART] PRIMARY KEY CLUSTERED
(
    [P_PARTKEY] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
END
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[ORDERS]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[ORDERS](
    [O_ORDERKEY] [int] NOT NULL,
    [O_CUSTKEY] [int] NOT NULL,
    [O_ORDERSTATUS] [char](1) NOT NULL,
    [O_TOTALPRICE] [decimal](15, 2) NOT NULL,
    [O_ORDERDATE] [datetime] NOT NULL,
    [O_ORDERPRIORITY] [char](15) NOT NULL,
    [O_CLERK] [char](15) NOT NULL,
    [O_SHIPPRIORITY] [int] NOT NULL,
    [O_COMMENT] [varchar](79) NOT NULL,
 CONSTRAINT [PK_ORDERS_1] PRIMARY KEY CLUSTERED
(
    [O_ORDERKEY] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
END
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[LINEITEM]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[LINEITEM](
    [L_ORDERKEY] [int] NOT NULL,
    [L_PARTKEY] [int] NOT NULL,
    [L_SUPPKEY] [int] NOT NULL,
    [L_LINENUMBER] [int] NOT NULL,
    [L_QUANTITY] [decimal](15, 2) NOT NULL,
    [L_EXTENDEDPRICE] [decimal](15, 2) NOT NULL,
    [L_DISCOUNT] [decimal](15, 2) NOT NULL,
    [L_TAX] [decimal](15, 2) NOT NULL,
    [L_RETURNFLAG] [char](1) NOT NULL,
    [L_LINESTATUS] [char](1) NOT NULL,
    [L_SHIPDATE] [datetime] NOT NULL,
    [L_COMMITDATE] [datetime] NOT NULL,
    [L_RECEIPTDATE] [datetime] NOT NULL,
    [L_SHIPINSTRUCT] [char](25) NOT NULL,
    [L_SHIPMODE] [char](10) NOT NULL,
    [L_COMMENT] [varchar](44) NOT NULL
) ON [PRIMARY]
END
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[CUSTOMER]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[CUSTOMER](
    [C_CUSTKEY] [int] NOT NULL,
    [C_NAME] [varchar](25) NOT NULL,
    [C_ADDRESS] [varchar](40) NOT NULL,
    [C_NATIONKEY] [int] NOT NULL,
    [C_PHONE] [char](15) NOT NULL,
    [C_ACCTBAL] [decimal](15, 2) NOT NULL,
    [C_MKTSEGMENT] [char](10) NOT NULL,
    [C_COMMENT] [varchar](117) NOT NULL,
 CONSTRAINT [PK_CUSTOMER] PRIMARY KEY CLUSTERED
(
    [C_CUSTKEY] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
END
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[SUPPLIER]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[SUPPLIER](
    [S_SUPPKEY] [int] NOT NULL,
    [S_NAME] [char](25) NOT NULL,
    [S_ADDRESS] [varchar](40) NOT NULL,
    [S_NATIONKEY] [int] NOT NULL,
    [S_PHONE] [char](15) NOT NULL,
    [S_ACCTBAL] [decimal](15, 2) NOT NULL,
    [S_COMMENT] [varchar](101) NOT NULL,
 CONSTRAINT [PK_SUPPLIER] PRIMARY KEY CLUSTERED
(
    [S_SUPPKEY] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
END
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[NATION]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[NATION](
    [N_NATIONKEY] [int] NOT NULL,
    [N_NAME] [char](25) NOT NULL,
    [N_REGIONKEY] [int] NOT NULL,
    [N_COMMENT] [varchar](152) NULL,
 CONSTRAINT [PK_NATION] PRIMARY KEY CLUSTERED
(
    [N_NATIONKEY] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
END
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[PARTSUPP]') AND type in (N'U'))
BEGIN
CREATE TABLE [dbo].[PARTSUPP](
    [PS_PARTKEY] [int] NOT NULL,
    [PS_SUPPKEY] [int] NOT NULL,
    [PS_AVAILQTY] [int] NOT NULL,
    [PS_SUPPLYCOST] [decimal](15, 2) NOT NULL,
    [PS_COMMENT] [varchar](199) NOT NULL,
 CONSTRAINT [PK_PARTSUPP] PRIMARY KEY CLUSTERED
(
    [PS_PARTKEY] ASC,
    [PS_SUPPKEY] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
END
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_ORDERS_CUSTOMER]') AND parent_object_id = OBJECT_ID(N'[dbo].[ORDERS]'))
ALTER TABLE [dbo].[ORDERS] WITH CHECK ADD CONSTRAINT [FK_ORDERS_CUSTOMER] FOREIGN KEY([O_CUSTKEY])
REFERENCES [dbo].[CUSTOMER] ([C_CUSTKEY])
GO
ALTER TABLE [dbo].[ORDERS] CHECK CONSTRAINT [FK_ORDERS_CUSTOMER]
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_LINEITEM_ORDERS]') AND parent_object_id = OBJECT_ID(N'[dbo].[LINEITEM]'))
ALTER TABLE [dbo].[LINEITEM] WITH NOCHECK ADD CONSTRAINT [FK_LINEITEM_ORDERS] FOREIGN KEY([L_ORDERKEY])
REFERENCES [dbo].[ORDERS] ([O_ORDERKEY])
GO
ALTER TABLE [dbo].[LINEITEM] CHECK CONSTRAINT [FK_LINEITEM_ORDERS]
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_LINEITEM_PARTSUPP]') AND parent_object_id = OBJECT_ID(N'[dbo].[LINEITEM]'))
ALTER TABLE [dbo].[LINEITEM] WITH NOCHECK ADD CONSTRAINT [FK_LINEITEM_PARTSUPP] FOREIGN KEY([L_PARTKEY], [L_SUPPKEY])
REFERENCES [dbo].[PARTSUPP] ([PS_PARTKEY], [PS_SUPPKEY])
GO
ALTER TABLE [dbo].[LINEITEM] CHECK CONSTRAINT [FK_LINEITEM_PARTSUPP]
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_CUSTOMER_NATION]') AND parent_object_id = OBJECT_ID(N'[dbo].[CUSTOMER]'))
ALTER TABLE [dbo].[CUSTOMER] WITH CHECK ADD CONSTRAINT [FK_CUSTOMER_NATION] FOREIGN KEY([C_NATIONKEY])
REFERENCES [dbo].[NATION] ([N_NATIONKEY])
GO
ALTER TABLE [dbo].[CUSTOMER] CHECK CONSTRAINT [FK_CUSTOMER_NATION]
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_SUPPLIER_NATION]') AND parent_object_id = OBJECT_ID(N'[dbo].[SUPPLIER]'))
ALTER TABLE [dbo].[SUPPLIER] WITH CHECK ADD CONSTRAINT [FK_SUPPLIER_NATION] FOREIGN KEY([S_NATIONKEY])
REFERENCES [dbo].[NATION] ([N_NATIONKEY])
GO
ALTER TABLE [dbo].[SUPPLIER] CHECK CONSTRAINT [FK_SUPPLIER_NATION]
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_NATION_REGION]') AND parent_object_id = OBJECT_ID(N'[dbo].[NATION]'))
ALTER TABLE [dbo].[NATION] WITH CHECK ADD CONSTRAINT [FK_NATION_REGION] FOREIGN KEY([N_REGIONKEY])
REFERENCES [dbo].[REGION] ([R_REGIONKEY])
GO
ALTER TABLE [dbo].[NATION] CHECK CONSTRAINT [FK_NATION_REGION]
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_PARTSUPP_PART]') AND parent_object_id = OBJECT_ID(N'[dbo].[PARTSUPP]'))
ALTER TABLE [dbo].[PARTSUPP] WITH CHECK ADD CONSTRAINT [FK_PARTSUPP_PART] FOREIGN KEY([PS_PARTKEY])
REFERENCES [dbo].[PART] ([P_PARTKEY])
GO
ALTER TABLE [dbo].[PARTSUPP] CHECK CONSTRAINT [FK_PARTSUPP_PART]
GO

IF NOT EXISTS (SELECT * FROM sys.foreign_keys WHERE object_id = OBJECT_ID(N'[dbo].[FK_PARTSUPP_SUPPLIER]') AND parent_object_id = OBJECT_ID(N'[dbo].[PARTSUPP]'))
ALTER TABLE [dbo].[PARTSUPP] WITH CHECK ADD CONSTRAINT [FK_PARTSUPP_SUPPLIER] FOREIGN KEY([PS_SUPPKEY])
REFERENCES [dbo].[SUPPLIER] ([S_SUPPKEY])
GO
ALTER TABLE [dbo].[PARTSUPP] CHECK CONSTRAINT [FK_PARTSUPP_SUPPLIER]
GO

3) Import the 8 .tbl files generated by dbgen into the database.

GO
BULK INSERT part FROM 'C:\tpch_2_14_3\dbgen\part.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')
BULK INSERT customer FROM 'C:\tpch_2_14_3\dbgen\customer.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')
BULK INSERT orders FROM 'C:\tpch_2_14_3\dbgen\orders.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')
BULK INSERT partsupp FROM 'C:\tpch_2_14_3\dbgen\partsupp.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')
BULK INSERT supplier FROM 'C:\tpch_2_14_3\dbgen\supplier.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')
BULK INSERT lineitem FROM 'C:\tpch_2_14_3\dbgen\lineitem.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')
BULK INSERT nation FROM 'C:\tpch_2_14_3\dbgen\nation.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')
BULK INSERT region FROM 'C:\tpch_2_14_3\dbgen\region.tbl' WITH (TABLOCK, DATAFILETYPE='char', CODEPAGE='raw', FIELDTERMINATOR = '|')

At this point everything dbgen generated has been imported into the DBMS. Next we need to generate the SQL statements, which is where qgen comes in.

qgen is used much like dbgen.

Open the first of the 22 query templates

and you will see:

One of qgen's main jobs is to substitute the colon placeholders in the 22 query templates with concrete values; this parameterization is how the benchmark exercises the database.

Let's use qgen to generate the first query and save it into saveSql in the parent directory.

Run

qgen -d 1 > ..\saveSql\1.sql

In the saveSql folder you will now find 1.sql; open it to take a look.

select
    l_returnflag,
    l_linestatus,
    sum(l_quantity) as sum_qty,
    sum(l_extendedprice) as sum_base_price,
    sum(l_extendedprice*(1-l_discount)) as sum_disc_price,
    sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge,
    avg(l_quantity) as avg_qty,
    avg(l_extendedprice) as avg_price,
    avg(l_discount) as avg_disc,
    count(*) as count_order
from
    lineitem
where
    l_shipdate <= date '1998-12-01' - interval '90' day (3)
group by
    l_returnflag,
    l_linestatus
order by
    l_returnflag,
    l_linestatus;

set rowcount -1

Note that the original ':' placeholder has been replaced with 90.

Next, feed the saved SQL statements to a test harness of your own to run the measurements.

I won't cover how to write the test tool here; instead here are a few good domestic papers that may help:

http://wenku.baidu.com/view/46a99819a76e58fafab0035d.html

http://wenku.baidu.com/view/6df300aad1f34693daef3e18.html

http://wenku.baidu.com/view/b649f3c39ec3d5bbfd0a7454.html

Chinese translation of the TPC-H documentation (version 2.0.0.0):

http://wenku.baidu.com/view/024e682cbd64783e09122bf4.html

PostgreSQL column store, and testing COPY's ability to skip broken rows

susql builds on PostgreSQL 9.5 and adds column storage, zlib compression, and a COPY that can skip broken rows.
It can be downloaded for testing here:
https://github.com/susql/susql/releases
Thanks to Zhiming for his contributions to the PG community.

Syntax of the new features:
Synopsis
CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name ( [
  { column_name data_type [ COLLATE collation ] [ COMPRESS compression ] [ column_constraint [ ... ] ]
    | table_constraint
    | LIKE source_table [ like_option ... ] 
    | COMPRESS default_compression }
    [, ... ]
] )
[ INHERITS ( parent_table [, ... ] ) ]
[ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ]
[ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ]
[ TABLESPACE tablespace_name ]
[ STORED AS { HEAP | ORC } ]
[ SORT BY { column_name [,...]}]
Parameters

STORED AS
HEAP: row storage format (postgres heap, the default).
ORC: column storage format.

SORT BY
When a table is STORED AS ORC, sorts storage by the listed columns to optimize reads.

COMPRESS
Sets the column compression type for a table. COMPRESS default_compression supplies the default compression type when no per-column compression is specified.
There is a built-in compression type PGLZ and an extension compression type ZLIB (run create extension dc_zlib first to use ZLIB compression).
Note: rows added with INSERT are converted to column storage only after a VACUUM; rows loaded with COPY go straight into the _of file (the column store) with no conversion.

COPY table_name [ ( column_name [, ...] ) ]
    FROM { 'filename' | PROGRAM 'command' | DIRECTORY 'directory_name' [ RECURSIVE { 'NUMBER' } ] | STDIN }
    [ [ WITH ] ( option [, ...] ) ]

COPY { table_name [ ( column_name [, ...] ) ] | ( query ) }
    TO { 'filename' | PROGRAM 'command' | STDOUT }
    [ [ WITH ] ( option [, ...] ) ]

where option can be one of:
    FORMAT format_name
    OIDS [ boolean ]
    FREEZE [ boolean ]
    DELIMITER 'delimiter_character'
    NULL 'null_string'
    HEADER [ boolean ]
    QUOTE 'quote_character'
    ESCAPE 'escape_character'
    FORCE_QUOTE { ( column_name [, ...] ) | * }
    FORCE_NOT_NULL ( column_name [, ...] )
    FORCE_NULL ( column_name [, ...] )
    ENCODING 'encoding_name'
    UNSTRICT [ boolean ]
    UNSTRICT_NUM { number }

Parameters
DIRECTORY Indicates the input is a directory; every file in that directory will be copied.
RECURSIVE {NUMBER} Specifies whether the directory is copied recursively.
UNSTRICT Specifies whether the COPY continues when a broken line is encountered.
UNSTRICT_NUM Specifies how many broken lines are tolerated before the COPY aborts. Only effective when UNSTRICT is set.

Compression configurations are created with CREATE COMPRESS CONFIGURATION.
postgres=# create compress configuration zlib2 (template=zlib, level=9);
CREATE COMPRESS CONFIGURATION
Time: 9.996 ms

postgres=# create compress configuration pglz2 (template=pglz, ...);

The parameters pglz uses are (from the PostgreSQL source comments):
 * PGLZ_Strategy -
 *
 *              Some values that control the compression algorithm.
 *
 *              min_input_size          Minimum input data size to consider compression.
 *
 *              max_input_size          Maximum input data size to consider compression.
 *
 *              min_comp_rate           Minimum compression rate (0-99%) to require.
 *                                                      Regardless of min_comp_rate, the output must be
 *                                                      smaller than the input, else we don't store
 *                                                      compressed.
 *
 *              first_success_by        Abandon compression if we find no compressible
 *                                                      data within the first this-many bytes.
 *
 *              match_size_good         The initial GOOD match size when starting history
 *                                                      lookup. When looking up the history to find a
 *                                                      match that could be expressed as a tag, the
 *                                                      algorithm does not always walk back entirely.
 *                                                      A good match fast is usually better than the
 *                                                      best possible one very late. For each iteration
 *                                                      in the lookup, this value is lowered so the
 *                                                      longer the lookup takes, the smaller matches
 *                                                      are considered good.
 *
 *              match_size_drop         The percentage by which match_size_good is lowered
 *                                                      after each history check. Allowed values are
 *                                                      0 (no change until end) to 100 (only check
 *                                                      latest history entry at all).

Installation:
[root@digoal ~]# rpm -ivh susql50-5.0.1-0.el6.x86_64.rpm 
error: Failed dependencies:
        susql50-libs(x86-64) = 5.0.1-0.el6 is needed by susql50-5.0.1-0.el6.x86_64
[root@digoal ~]# rpm -ivh susql50-libs-5.0.1-0.el6.x86_64.rpm 
Preparing...                ########################################### [100%]
   1:susql50-libs           ########################################### [100%]
/sbin/ldconfig: /opt/gcc4.9.3/lib/libstdc++.so.6.0.20-gdb.py is not an ELF file - it has the wrong magic bytes at the start.

/sbin/ldconfig: /opt/gcc4.9.3/lib64/libstdc++.so.6.0.20-gdb.py is not an ELF file - it has the wrong magic bytes at the start.

[root@digoal ~]# rpm -ivh susql50-5.0.1-0.el6.x86_64.rpm 
Preparing...                ########################################### [100%]
   1:susql50                ########################################### [100%]
[root@digoal ~]# rpm -ivh susql50-server-5.0.1-0.el6.x86_64.rpm 
Preparing...                ########################################### [100%]
   1:susql50-server         ########################################### [100%]
/sbin/ldconfig: /opt/gcc4.9.3/lib/libstdc++.so.6.0.20-gdb.py is not an ELF file - it has the wrong magic bytes at the start.

/sbin/ldconfig: /opt/gcc4.9.3/lib64/libstdc++.so.6.0.20-gdb.py is not an ELF file - it has the wrong magic bytes at the start.

error reading information on service susql-5.0: No such file or directory
[root@digoal ~]# rpm -ivh susql50-contrib-5.0.1-0.el6.x86_64.rpm 
Preparing...                ########################################### [100%]
   1:susql50-contrib        ########################################### [100%]

Initialize the database
[root@digoal ~]# mkdir /data01/susql
[root@digoal ~]# chown susql:susql /data01/susql
[root@digoal ~]# su - susql

-bash-4.1$ vi env_pg.sh
export PS1="$USER@`/bin/hostname -s`-> "
export PGPORT=1923
export PGDATA=/data01/susql/pgdata/pg_root
export LANG=en_US.utf8
export PGHOME=/usr/susql-5.0
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGHOST=$PGDATA
export PGUSER=postgres
export PGDATABASE=postgres
alias rm='rm -i'
alias ll='ls -lh'
unalias vi

[root@digoal ~]# su - susql
-bash-4.1$ . ./env_pg.sh 
-bash: unalias: vi: not found
susql@digoal-> initdb -D $PGDATA -E UTF8 --locale=C -U postgres -W
The files belonging to this database system will be owned by user "susql".
This user must also own the server process.

The database cluster will be initialized with locale "C".
The default text search configuration will be set to "english".

Data page checksums are disabled.

creating directory /data01/susql/pgdata/pg_root ... ok
creating subdirectories ... ok
selecting default max_connections ... 100
selecting default shared_buffers ... 128MB
selecting dynamic shared memory implementation ... posix
creating configuration files ... ok
creating template1 database in /data01/susql/pgdata/pg_root/base/1 ... ok
initializing pg_authid ... ok
Enter new superuser password: 
Enter it again: 
setting password ... ok
initializing dependencies ... ok
creating system views ... ok
loading system objects' descriptions ... ok
creating collations ... ok
creating conversions ... ok
creating dictionaries ... ok
setting privileges on built-in objects ... ok
creating information schema ... ok
loading PL/pgSQL server-side language ... ok
vacuuming database template1 ... ok
copying template1 to template0 ... ok
copying template1 to postgres ... ok
syncing data to disk ... ok

WARNING: enabling "trust" authentication for local connections
You can change this by editing pg_hba.conf or using the option -A, or
--auth-local and --auth-host, the next time you run initdb.

Success. You can now start the database server using:

    pg_ctl -D /data01/susql/pgdata/pg_root -l logfile start


susql@digoal-> cd $PGDATA

Edit the configuration file
susql@digoal-> grep "^[a-z]" postgresql.conf
listen_addresses = '0.0.0.0'            # what IP address(es) to listen on;
port = 1923                             # (change requires restart)
max_connections = 100                   # (change requires restart)
unix_socket_directories = '.'   # comma-separated list of directories
unix_socket_permissions = 0700          # begin with 0 to use octal notation
tcp_keepalives_idle = 70                # TCP_KEEPIDLE, in seconds;
tcp_keepalives_interval = 10            # TCP_KEEPINTVL, in seconds;
tcp_keepalives_count = 10               # TCP_KEEPCNT;
shared_buffers = 4096MB                 # min 128kB
dynamic_shared_memory_type = posix      # the default is the first option
bgwriter_delay = 10ms                   # 10-10000ms between rounds
bgwriter_lru_maxpages = 1000            # 0-1000 max buffers written/round
synchronous_commit = off                # synchronization level;
full_page_writes = off                  # recover from partial page writes
wal_compression = off                   # enable compression of full-page writes
wal_log_hints = off                     # also do full page writes of non-critical updates
wal_buffers = 16MB                      # min 32kB, -1 sets based on shared_buffers
wal_writer_delay = 10ms         # 1-10000 milliseconds
checkpoint_timeout = 35min              # range 30s-1h
max_wal_size = 4GB
checkpoint_completion_target = 0.9      # checkpoint target duration, 0.0 - 1.0
effective_cache_size = 4GB
log_destination = 'csvlog'              # Valid values are combinations of
logging_collector = on                  # Enable capturing of stderr and csvlog
log_directory = 'pg_log'                # directory where log files are written,
log_filename = 'postgresql-%a.log'      # log file name pattern,
log_truncate_on_rotation = on           # If on, an existing log file with the
log_rotation_age = 1d                   # Automatic rotation of logfiles will
log_rotation_size = 0                   # Automatic rotation of logfiles will
log_checkpoints = on
log_connections = on
log_disconnections = on
log_error_verbosity = verbose            # terse, default, or verbose messages
log_timezone = 'PRC'
log_autovacuum_min_duration = 0 # -1 disables, 0 logs all actions and
datestyle = 'iso, mdy'
timezone = 'PRC'
lc_messages = 'C'                       # locale for system error message
lc_monetary = 'C'                       # locale for monetary formatting
lc_numeric = 'C'                        # locale for number formatting
lc_time = 'C'                           # locale for time formatting
default_text_search_config = 'pg_catalog.english'
enable_copyto_flush = on                #When set, Copy To operation will explicit flush the data when finishing
copy_extension = on                     #When set, Copy Extension Feature Enabled

Tests
susql@digoal-> psql
psql (9.5.0 (SuSQL 5.0.1))
Type "help" for help.
postgres=# create extension dc_zlib;
CREATE EXTENSION
postgres=# create table test_col(
id int, 
info text compress zlib, 
crt_time timestamp, 
c1 int, 
c2 int, 
c3 int, 
c4 int, 
c5 int, 
c6 int, 
c7 int, 
c8 int, 
c9 int, 
c10 int, 
c11 int, 
c12 int) 
stored as orc 
sort by id;

postgres=# create table test_heap(
id int, 
info text compress zlib, 
crt_time timestamp, 
c1 int, 
c2 int, 
c3 int, 
c4 int, 
c5 int, 
c6 int, 
c7 int, 
c8 int, 
c9 int, 
c10 int, 
c11 int, 
c12 int) 
stored as heap;

postgres=# create table test_heap_pglz(
id int, 
info text compress pglz, 
crt_time timestamp, 
c1 int, 
c2 int, 
c3 int, 
c4 int, 
c5 int, 
c6 int, 
c7 int, 
c8 int, 
c9 int, 
c10 int, 
c11 int, 
c12 int) 
stored as heap;


postgres=# \d+++ test_heap
                                         Table "public.test_heap"
  Column  |            Type             | Modifiers | Storage  | Compression | Stats target | Description 
----------+-----------------------------+-----------+----------+-------------+--------------+-------------
 id       | integer                     |           | plain    |             |              | 
 info     | text                        |           | extended | zlib        |              | 
 crt_time | timestamp without time zone |           | plain    |             |              | 
 c1       | integer                     |           | plain    |             |              | 
 c2       | integer                     |           | plain    |             |              | 
 c3       | integer                     |           | plain    |             |              | 
 c4       | integer                     |           | plain    |             |              | 
 c5       | integer                     |           | plain    |             |              | 
 c6       | integer                     |           | plain    |             |              | 
 c7       | integer                     |           | plain    |             |              | 
 c8       | integer                     |           | plain    |             |              | 
 c9       | integer                     |           | plain    |             |              | 
 c10      | integer                     |           | plain    |             |              | 
 c11      | integer                     |           | plain    |             |              | 
 c12      | integer                     |           | plain    |             |              | 
Stored As: HEAP

postgres=# \d+++ test_col
                                         Table "public.test_col"
  Column  |            Type             | Modifiers | Storage  | Compression | Stats target | Description 
----------+-----------------------------+-----------+----------+-------------+--------------+-------------
 id       | integer                     |           | plain    |             |              | 
 info     | text                        |           | extended | zlib        |              | 
 crt_time | timestamp without time zone |           | plain    |             |              | 
 c1       | integer                     |           | plain    |             |              | 
 c2       | integer                     |           | plain    |             |              | 
 c3       | integer                     |           | plain    |             |              | 
 c4       | integer                     |           | plain    |             |              | 
 c5       | integer                     |           | plain    |             |              | 
 c6       | integer                     |           | plain    |             |              | 
 c7       | integer                     |           | plain    |             |              | 
 c8       | integer                     |           | plain    |             |              | 
 c9       | integer                     |           | plain    |             |              | 
 c10      | integer                     |           | plain    |             |              | 
 c11      | integer                     |           | plain    |             |              | 
 c12      | integer                     |           | plain    |             |              | 
Stored As: ORC

postgres=# \d+++ test_heap_pglz 
                                      Table "public.test_heap_pglz"
  Column  |            Type             | Modifiers | Storage  | Compression | Stats target | Description 
----------+-----------------------------+-----------+----------+-------------+--------------+-------------
 id       | integer                     |           | plain    |             |              | 
 info     | text                        |           | extended | pglz        |              | 
 crt_time | timestamp without time zone |           | plain    |             |              | 
 c1       | integer                     |           | plain    |             |              | 
 c2       | integer                     |           | plain    |             |              | 
 c3       | integer                     |           | plain    |             |              | 
 c4       | integer                     |           | plain    |             |              | 
 c5       | integer                     |           | plain    |             |              | 
 c6       | integer                     |           | plain    |             |              | 
 c7       | integer                     |           | plain    |             |              | 
 c8       | integer                     |           | plain    |             |              | 
 c9       | integer                     |           | plain    |             |              | 
 c10      | integer                     |           | plain    |             |              | 
 c11      | integer                     |           | plain    |             |              | 
 c12      | integer                     |           | plain    |             |              | 
Stored As: HEAP

Performance tests
postgres=# \timing
Timing is on.
postgres=# insert into test_heap select i,repeat(md5(random()::text),64),clock_timestamp(),i,i,i,i,i,i,i,i,i,i,i,i from generate_series(1,1000000) t(i);
INSERT 0 1000000
Time: 18183.374 ms
postgres=# insert into test_col select i,repeat(md5(random()::text),64),clock_timestamp(),i,i,i,i,i,i,i,i,i,i,i,i from generate_series(1,1000000) t(i);
INSERT 0 1000000
Time: 19871.817 ms
postgres=# insert into test_heap select trunc(5000000*random()),repeat(md5(random()::text),64),clock_timestamp(),i,i,i,i,i,i,i,i,i,i,i,i from generate_series(1,1000000) t(i);
INSERT 0 1000000
Time: 20575.763 ms
postgres=# insert into test_col select trunc(5000000*random()),repeat(md5(random()::text),64),clock_timestamp(),i,i,i,i,i,i,i,i,i,i,i,i from generate_series(1,1000000) t(i);
INSERT 0 1000000
Time: 20440.462 ms

postgres=# select count(*) from test_col where id<10;
 count 
-------
    10
(1 row)

Time: 417.611 ms
postgres=# select count(*) from test_col where id<10;
 count 
-------
    10
(1 row)

Time: 386.153 ms
postgres=# select count(*) from test_heap where id<10;
 count 
-------
    12
(1 row)

Time: 204.857 ms
postgres=# select count(*) from test_heap where id<10;
 count 
-------
    12
(1 row)

Time: 210.323 ms
postgres=# select count(*) from test_col ;
  count  
---------
 2000000
(1 row)

Time: 383.868 ms
postgres=# select count(*) from test_heap;
  count  
---------
 2000000
(1 row)
Time: 182.571 ms

postgres=# \dt+ 
                         List of relations
 Schema |      Name      | Type  |  Owner   |  Size  | Description 
--------+----------------+-------+----------+--------+-------------
 public | test_col       | table | postgres | 319 MB | 
 public | test_heap      | table | postgres | 319 MB | 
 public | test_heap_pglz | table | postgres | 319 MB | 
(3 rows)

postgres=# select count(id) from test_heap;
  count  
---------
 2000000
(1 row)

Time: 210.082 ms

postgres=# select count(id) from test_col;
  count  
---------
 2000000
(1 row)

Time: 468.594 ms

postgres=# explain analyze select count(distinct id) from test_col;
                                                        QUERY PLAN                                                         
---------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=8.83..8.84 rows=1 width=4) (actual time=1743.652..1743.652 rows=1 loops=1)
   ->  Custom Scan (ORC) on test_col  (cost=0.00..7.06 rows=706 width=4) (actual time=0.018..431.255 rows=2000000 loops=1)
 Planning time: 0.112 ms
 Execution time: 1743.735 ms
(4 rows)

Time: 1744.215 ms
postgres=# explain analyze select count(distinct id) from test_heap;
                                                        QUERY PLAN                                                         
---------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=65816.71..65816.72 rows=1 width=4) (actual time=1513.416..1513.416 rows=1 loops=1)
   ->  Seq Scan on test_heap  (cost=0.00..60816.77 rows=1999977 width=4) (actual time=0.017..221.881 rows=2000000 loops=1)
 Planning time: 0.068 ms
 Execution time: 1513.478 ms
(4 rows)

Time: 1513.900 ms

postgres=# explain analyze select count(distinct (id,info)) from test_col;
                                                         QUERY PLAN                                                         
----------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=8.83..8.84 rows=1 width=36) (actual time=15137.273..15137.274 rows=1 loops=1)
   ->  Custom Scan (ORC) on test_col  (cost=0.00..7.06 rows=706 width=36) (actual time=0.014..513.342 rows=2000000 loops=1)
 Planning time: 0.127 ms
 Execution time: 15137.395 ms
(4 rows)

Time: 15138.411 ms
postgres=# explain analyze select count(distinct (id,info)) from test_heap;
                                                         QUERY PLAN                                                         
----------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=65816.71..65816.72 rows=1 width=72) (actual time=13865.787..13865.787 rows=1 loops=1)
   ->  Seq Scan on test_heap  (cost=0.00..60816.77 rows=1999977 width=72) (actual time=0.012..235.289 rows=2000000 loops=1)
 Planning time: 0.104 ms
 Execution time: 13865.856 ms
(4 rows)

Time: 13866.441 ms


Sorting
postgres=# set work_mem='1GB';
postgres=# explain analyze select id from test_heap order by id;
                                                        QUERY PLAN                                                         
---------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=270132.69..275132.69 rows=2000000 width=4) (actual time=766.044..959.705 rows=2000000 loops=1)
   Sort Key: id
   Sort Method: quicksort  Memory: 142903kB
   ->  Seq Scan on test_heap  (cost=0.00..60817.00 rows=2000000 width=4) (actual time=0.012..254.047 rows=2000000 loops=1)
 Planning time: 0.148 ms
 Execution time: 1041.256 ms
(6 rows)

postgres=# explain analyze select * from test_heap order by id;
                                                         QUERY PLAN                                                          
-----------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=270132.69..275132.69 rows=2000000 width=128) (actual time=820.897..1104.178 rows=2000000 loops=1)
   Sort Key: id
   Sort Method: quicksort  Memory: 580403kB
   ->  Seq Scan on test_heap  (cost=0.00..60817.00 rows=2000000 width=128) (actual time=0.017..202.813 rows=2000000 loops=1)
 Planning time: 0.107 ms
 Execution time: 1193.899 ms
(6 rows)

postgres=# explain analyze select * from test_col order by id;
                                                             QUERY PLAN                                                              
-------------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=229315.69..234315.69 rows=2000000 width=128) (actual time=1269.664..1553.249 rows=2000000 loops=1)
   Sort Key: id
   Sort Method: quicksort  Memory: 580403kB
   ->  Custom Scan (ORC) on test_col  (cost=0.00..20000.00 rows=2000000 width=128) (actual time=0.019..417.134 rows=2000000 loops=1)
 Planning time: 0.145 ms
 Execution time: 1646.298 ms
(6 rows)

postgres=# explain analyze select id from test_col order by id;
                                                            QUERY PLAN                                                             
-----------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=229315.69..234315.69 rows=2000000 width=4) (actual time=1001.483..1207.091 rows=2000000 loops=1)
   Sort Key: id
   Sort Method: quicksort  Memory: 142903kB
   ->  Custom Scan (ORC) on test_col  (cost=0.00..20000.00 rows=2000000 width=4) (actual time=0.020..466.365 rows=2000000 loops=1)
 Planning time: 0.112 ms
 Execution time: 1293.078 ms
(6 rows)

postgres=# set work_mem='1MB';
SET
postgres=# explain analyze select id from test_col order by id;
                                                            QUERY PLAN                                                             
-----------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=284006.69..289006.69 rows=2000000 width=4) (actual time=1837.612..2159.556 rows=2000000 loops=1)
   Sort Key: id
   Sort Method: external merge  Disk: 27392kB
   ->  Custom Scan (ORC) on test_col  (cost=0.00..20000.00 rows=2000000 width=4) (actual time=0.019..465.012 rows=2000000 loops=1)
 Planning time: 0.110 ms
 Execution time: 2246.295 ms
(6 rows)

postgres=# explain analyze select id from test_heap order by id;
                                                        QUERY PLAN                                                         
---------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=324823.69..329823.69 rows=2000000 width=4) (actual time=1742.758..2080.076 rows=2000000 loops=1)
   Sort Key: id
   Sort Method: external merge  Disk: 27384kB
   ->  Seq Scan on test_heap  (cost=0.00..60817.00 rows=2000000 width=4) (actual time=0.016..278.976 rows=2000000 loops=1)
 Planning time: 0.079 ms
 Execution time: 2176.640 ms
(6 rows)

Verify whether the column store's SORT BY id has taken effect; judging from the results below, the rows are not stored sorted.
postgres=# select id from test_col limit 1 offset 1000000;
   id    
---------
 3492623
(1 row)

postgres=# select id from test_col limit 1 offset 1000001;
   id   
--------
 292687
(1 row)

postgres=# select id from test_col limit 1 offset 1000002;
   id    
---------
 3435092
(1 row)


Inspect the physical files
postgres=# select pg_relation_filepath ('test_heap'::regclass);
 pg_relation_filepath 
----------------------
 base/13245/16436
(1 row)

Time: 0.593 ms
postgres=# select pg_relation_filepath ('test_col'::regclass);
 pg_relation_filepath 
----------------------
 base/13245/16408
(1 row)
Time: 0.456 ms

postgres=# \q
susql@digoal-> cd $PGDATA/base/
susql@digoal-> ll 13245/16436*
-rw------- 1 susql susql 319M Feb 29 09:04 13245/16436
-rw------- 1 susql susql 104K Feb 29 09:04 13245/16436_fsm
susql@digoal-> ll 13245/16408*
-rw------- 1 susql susql 319M Feb 29 09:04 13245/16408
-rw------- 1 susql susql 104K Feb 29 09:04 13245/16408_fsm
-rw------- 1 susql susql 8.0K Feb 29 08:58 13245/16408_of
Data added with INSERT has not been written to the column store; a VACUUM is needed to rewrite it.
postgres=# vacuum test_col ;
VACUUM
-rw------- 1 susql susql    0 Feb 29 12:57 16408
-rw------- 1 susql susql  16K Feb 29 12:57 16408_fsm
-rw------- 1 susql susql 319M Feb 29 12:57 16408_of

After the rewrite, SORT BY takes effect.
postgres=# select id from test_heap offset 1000000 limit 10;
   id    
---------
 4668069
 3385308
 3298340
 2234740
 3610229
 2569580
 1705913
  828489
  856546
 1197041
(10 rows)

Time: 123.598 ms
postgres=# select id from test_col offset 1000000 limit 10;
   id   
--------
 990214
 990215
 990216
 990217
 990218
 990219
 990220
 990221
 990222
 990223
(10 rows)

Time: 100.979 ms

Check the build options
susql@digoal-> pg_config
BINDIR = /usr/susql-5.0/bin
DOCDIR = /usr/susql-5.0/doc
HTMLDIR = /usr/susql-5.0/doc
INCLUDEDIR = /usr/susql-5.0/include
PKGINCLUDEDIR = /usr/susql-5.0/include
INCLUDEDIR-SERVER = /usr/susql-5.0/include/server
LIBDIR = /usr/susql-5.0/lib
PKGLIBDIR = /usr/susql-5.0/lib
LOCALEDIR = /usr/susql-5.0/share/locale
MANDIR = /usr/susql-5.0/share/man
SHAREDIR = /usr/susql-5.0/share
SYSCONFDIR = /etc/sysconfig/susql
PGXS = /usr/susql-5.0/lib/pgxs/src/makefiles/pgxs.mk
CONFIGURE = '--enable-rpath' '--prefix=/usr/susql-5.0' '--includedir=/usr/susql-5.0/include' '--mandir=/usr/susql-5.0/share/man' '--datadir=/usr/susql-5.0/share' '--enable-nls' '--with-libxml' '--with-libxslt' '--with-system-tzdata=/usr/share/zoneinfo' '--sysconfdir=/etc/sysconfig/susql' '--docdir=/usr/susql-5.0/doc' 'CFLAGS=-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -DLINUX_OOM_ADJ=0' 'LDFLAGS=-Wl,--as-needed'
CC = gcc
CPPFLAGS = -D_GNU_SOURCE -I/usr/include/libxml2
CFLAGS = -DPGPG -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -DLINUX_OOM_ADJ=0
CFLAGS_SL = -fpic
LDFLAGS = -L../../../src/common -Wl,--as-needed -Wl,--as-needed -Wl,-rpath,'/usr/susql-5.0/lib',--enable-new-dtags
LDFLAGS_EX = 
LDFLAGS_SL = 
LIBS = -lpgcommon -lpgport -lxslt -lxml2 -lz -lreadline -lrt -lcrypt -ldl -lm 
VERSION = PostgreSQL 9.5.0 (SuSQL 5.0.1)


Verify COPY's skip-broken-rows feature
postgres=# copy test_heap to '/data01/susql/test.csv' ;
COPY 2000000
-rw-r--r-- 1 susql susql 4.1G Feb 29 09:22 test.csv

[root@digoal susql]# echo "err" >> test.csv 
[root@digoal susql]# echo "err" >> test.csv 
[root@digoal susql]# echo "err" >> test.csv 
[root@digoal susql]# echo "err" >> test.csv 
[root@digoal susql]# echo "err" >> test.csv 
[root@digoal susql]# head -n 10 test.csv >> test.csv


postgres=# copy test_col from '/data01/susql/test.csv' with( UNSTRICT true, UNSTRICT_NUM 10);
INFO:  00000: invalid input syntax for integer: "err" (822)
CONTEXT:  COPY test_col, file /data01/susql/test.csv, line 2000001, column id: "err"
LOCATION:  PerformErrorLoggingForCopy, copy.c:5117
INFO:  00000: invalid input syntax for integer: "err" (822)
CONTEXT:  COPY test_col, file /data01/susql/test.csv, line 2000002, column id: "err"
LOCATION:  PerformErrorLoggingForCopy, copy.c:5117
INFO:  00000: invalid input syntax for integer: "err" (822)
CONTEXT:  COPY test_col, file /data01/susql/test.csv, line 2000003, column id: "err"
LOCATION:  PerformErrorLoggingForCopy, copy.c:5117
INFO:  00000: invalid input syntax for integer: "err" (822)
CONTEXT:  COPY test_col, file /data01/susql/test.csv, line 2000004, column id: "err"
LOCATION:  PerformErrorLoggingForCopy, copy.c:5117
INFO:  00000: invalid input syntax for integer: "err" (822)
CONTEXT:  COPY test_col, file /data01/susql/test.csv, line 2000005, column id: "err"
LOCATION:  PerformErrorLoggingForCopy, copy.c:5117
COPY 2000010

Compression ratio comparison:
postgres=# create compress configuration zlib0 (template=zlib, level=1);
CREATE COMPRESS CONFIGURATION
Time: 0.814 ms
postgres=# create compress configuration zlib2 (template=zlib, level=9);
CREATE COMPRESS CONFIGURATION
Time: 0.814 ms
postgres=# create table test_heap_zlib0(                       
id int, 
info text compress zlib0, 
crt_time timestamp, 
c1 int, 
c2 int, 
c3 int, 
c4 int, 
c5 int, 
c6 int, 
c7 int, 
c8 int, 
c9 int, 
c10 int, 
c11 int, 
c12 int) 
stored as heap;
CREATE TABLE
postgres=# create table test_heap_zlib2(                       
id int, 
info text compress zlib2, 
crt_time timestamp, 
c1 int, 
c2 int, 
c3 int, 
c4 int, 
c5 int, 
c6 int, 
c7 int, 
c8 int, 
c9 int, 
c10 int, 
c11 int, 
c12 int) 
stored as heap;
CREATE TABLE
postgres=# insert into test_heap_zlib0 select * from test_heap;
INSERT 0 4000000
postgres=# insert into test_heap_zlib2 select * from test_heap;
INSERT 0 4000000
postgres=# select pg_size_pretty(pg_total_relation_size('test_heap'));
 pg_size_pretty 
----------------
 638 MB
(1 row)

Time: 1.190 ms
postgres=# select pg_size_pretty(pg_total_relation_size('test_heap_zlib0'));
 pg_size_pretty 
----------------
 638 MB
(1 row)

Time: 0.344 ms
postgres=# select pg_size_pretty(pg_total_relation_size('test_heap_zlib2'));
 pg_size_pretty 
----------------
 638 MB
(1 row)


Summary:
1. In this test case, pglz and zlib produced the same compression ratio.
2. In this test case, column-store queries were not faster than row-store queries. The column store is built on the custom scan provider interface added in 9.5, similar to how pgstrom drives GPUs.
3. On disk the column store matches the row store layout; files are not split per column. Data added with INSERT becomes columnar only after a VACUUM; data loaded with COPY needs no VACUUM.
4. COPY can skip broken rows and reports their line numbers.

The data volume in this test is fairly small; interested readers can try much larger volumes to see how the column store performs.
For questions, contact Zhiming.

PostgreSQL: "changing" the order of table columns


In some scenarios a user wants to add a column at a particular position among the existing columns, e.g. (MySQL syntax):
alter table test add column c1 int after id;
which adds a column right after id.
PostgreSQL can offer the same behavior through a SQL rewrite.
But first, understand PostgreSQL's physical storage: data is organized in tuples, and every tuple has a fixed storage layout; the physical column order is fixed and is interpreted according to the order recorded in pg_attribute.
So how can the order the user sees be made to change?
Use a simple view, i.e. a rewrite rule.

postgres=# create table tbl(id int, info text, crt_time timestamp);
CREATE TABLE
Time: 15.285 ms
postgres=# alter table tbl add column c1 int;
ALTER TABLE
Time: 12.872 ms
postgres=# create view v_tbl as select id,info,c1,crt_time from tbl;
CREATE VIEW
Time: 0.889 ms
postgres=# insert into v_tbl values (1,'test',2,now());
INSERT 0 1
Time: 1.208 ms
postgres=# select * from v_tbl
postgres-# ;
 id | info | c1 |          crt_time          
----+------+----+----------------------------
  1 | test |  2 | 2016-02-29 14:07:19.171928
(1 row)

Time: 0.544 ms
postgres=# select * from tbl;
 id | info |          crt_time          | c1 
----+------+----------------------------+----
  1 | test | 2016-02-29 14:07:19.171928 |  2
(1 row)

Time: 0.282 ms
postgres=# select attname,attnum,attisdropped from pg_attribute where attrelid ='tbl'::regclass;
 attname  | attnum | attisdropped 
----------+--------+--------------
 tableoid |     -7 | f
 cmax     |     -6 | f
 xmax     |     -5 | f
 cmin     |     -4 | f
 xmin     |     -3 | f
 ctid     |     -1 | f
 id       |      1 | f
 info     |      2 | f
 crt_time |      3 | f
 c1       |      4 | f
(10 rows)

Time: 0.708 ms
postgres=# alter table tbl drop column info;
ERROR:  cannot drop table tbl column info because other objects depend on it
DETAIL:  view v_tbl depends on table tbl column info
HINT:  Use DROP ... CASCADE to drop the dependent objects too.
Time: 8.794 ms
postgres=# alter table tbl drop column info cascade; 
NOTICE:  drop cascades to view v_tbl
ALTER TABLE
Time: 1.561 ms
postgres=# \d v_t

postgres=# create view v_tbl as select id,c1,crt_time from tbl;
CREATE VIEW
Time: 2.248 ms
postgres=# select attname,attnum,attisdropped from pg_attribute where attrelid ='tbl'::regclass;
           attname            | attnum | attisdropped 
------------------------------+--------+--------------
 tableoid                     |     -7 | f
 cmax                         |     -6 | f
 xmax                         |     -5 | f
 cmin                         |     -4 | f
 xmin                         |     -3 | f
 ctid                         |     -1 | f
 id                           |      1 | f
 ........pg.dropped.2........ |      2 | t
 crt_time                     |      3 | f
 c1                           |      4 | f
(10 rows)

Time: 0.675 ms
postgres=# insert into v_tbl values (1,2,now());
INSERT 0 1
Time: 0.370 ms
postgres=# select * from v_tbl;
 id | c1 |          crt_time          
----+----+----------------------------
  1 |  2 | 2016-02-29 14:07:19.171928
  1 |  2 | 2016-02-29 14:09:18.499834
(2 rows)

Time: 0.295 ms
postgres=# select * from tbl;
 id |          crt_time          | c1 
----+----------------------------+----
  1 | 2016-02-29 14:07:19.171928 |  2
  1 | 2016-02-29 14:09:18.499834 |  2
(2 rows)

Time: 0.375 ms

Linux timestamp conversion / dmesg time conversion

http://blog.csdn.net/buptapple/article/details/8568938

Linux timestamp conversion

1. Convert a date to a timestamp
$ date +%s -d "04/24/2014 15:30:00"
1398324600
2. Convert a timestamp to a date
$ date -d @1398324600
Thu Apr 24 15:30:00 CST 2014
3. Convert the current date to a timestamp
$date +%s
1398765730

dmesg time conversion

dmesg output timestamps are not human-readable; they can be converted with a command.

For the record:

To view the time:

date -d "1970-01-01 UTC `echo "$(date +%s)-$(cat /proc/uptime|cut -f 1 -d' ')+12288812.926194"|bc ` seconds"

/proc/uptime explained

On Linux we often use the uptime command to check how long the system has been running; it is backed by the file /proc/uptime, explained in detail below.

master@jay-intel:~$ cat /proc/uptime
6447032.12 48185264.69
master@jay-intel:~$ cat /proc/cpuinfo  | grep processor | wc -l
8

The first column is the time since boot, in seconds (call it num1);
the second column is the cumulative idle time, in seconds (call it num2).

Note: many people know the second number is idle time, but you may not know that on an SMP system the idle time can be several times the uptime. How come? Because idle time is accumulated across all logical CPUs (including hyperthreads).

System idle rate (%) = num2/(num1*N), where N is the number of CPUs in the SMP system.

From the numbers on my machine above:
uptime = 6447032.12 seconds = 74.6 days
idle rate = 48185264.69/(6447032.12*8) = 93.4%

A high idle rate means the machine has headroom for more load; a very low idle rate suggests upgrading the hardware or migrating part of the load to other machines.

Some docs from Redhat:
The first number is the total number of seconds the system has been up. The second number is how much of that time the machine has spent idle, in seconds. (Jay’s comments: Please pay attention to SMP system.)

How to build a physical standby of an Alibaba Cloud RDS PostgreSQL database



A user has bought RDS PostgreSQL on Alibaba Cloud; how can they build a standby in their own data center or on ECS?
Building a logical standby is covered in detail in an earlier article of mine; a logical standby can span versions and can even replicate only a subset of tables:
https://yq.aliyun.com/articles/7240

If you need a standby identical to the RDS PostgreSQL instance, use streaming replication or WAL archiving.

The steps are:
.1. Prepare the standby environment
    Install 64-bit Linux
    Install PostgreSQL software of the same major version as the RDS PostgreSQL instance
    Plan disk space
.2. Request a user with the replication role
.3. Configure a public address (optional)
.4. Configure the whitelist and verify connectivity
.5. Download a full backup set and verify that the archive download API works
.6. Configure postgresql.conf and recovery.conf
.7. Start the standby and check that it replicates

The steps in detail
.1. Prepare the standby environment
Install CentOS 6.x x64 (steps omitted).
The production RDS PostgreSQL version is 9.4.x, so the standby needs the same 9.4 major version; we can install 9.4.6, reading the release notes to confirm compatibility.
PS: the Alibaba RDS PostgreSQL build is not yet downloadable or open source; if it ever is, install Alibaba Cloud's build to guarantee compatibility, and so that problems can be fixed by Alibaba Cloud's PostgreSQL kernel team.

Make sure the build options match production, including extension versions.
You only need to check the following:
select name,setting from pg_settings;  
 block_size                          | 8192  
 wal_block_size                      | 8192  
 rds_available_extensions            | plpgsql,pg_stat_statements,btree_gin,btree_gist,chkpass,citext,cube,dblink,dict_int,earthdistance,hstore,intagg,intarray,isn,ltree,pgcrypto,pgrowlocks,pg_prewarm,pg_trgm,postgres_fdw,sslinfo,tablefu  
nc,tsearch2,unaccent,postgis,postgis_topology,fuzzystrmatch,postgis_tiger_geocoder,plperl,pltcl,plv8,plls,plcoffee,"uuid-ossp",zhparser,pgrouting,rdkit,pg_hint_plan,pgstattuple  

Install the software

wget https://ftp.postgresql.org/pub/source/v9.4.6/postgresql-9.4.6.tar.bz2  
tar -jxvf postgresql-9.4.6.tar.bz2  
cd postgresql-9.4.6  
./configure --prefix=/home/postgres/pgsql9.4.6 --with-blocksize=8 --with-wal-blocksize=8   
gmake -j 32 world  
gmake install-world  

Configure environment variables

vi ~/env_pg.sh  
# add by digoal  
export PS1="$USER@`/bin/hostname -s`-> "  
export PGPORT=1921  
export PGDATA=/data01/pgdata  
export LANG=en_US.utf8  
export PGHOME=/home/postgres/pgsql9.4.6  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
export PGHOST=$PGDATA  
export PGDATABASE=postgres  
export PGUSER=postgres  
alias rm='rm -i'  
alias ll='ls -lh'  
unalias vi  

postgres@digoal-> . ./env_pg.sh   
-bash: unalias: vi: not found  
postgres@digoal-> psql -V  
psql (PostgreSQL) 9.4.6  

The following extensions can be skipped if unused; otherwise install them manually (see each extension's site, or search my blog):

postgis, plv8, plls, plcoffee, zhparser, pgrouting, rdkit, pg_hint_plan  

Plan disk space
The directory must be at least as large as the RDS instance you purchased; e.g. for a 5 GB instance, a single local directory must offer more than 5 GB.
PS: RDS PostgreSQL does not currently allow custom tablespaces, so all data lives in the default tablespace, which is why a single directory must exceed the purchased size. If RDS later allows creating tablespaces, you can re-plan the local directories.

[root@digoal ~]# df -h  
Filesystem      Size  Used Avail Use% Mounted on  
/dev/sda2        39G   22G   15G  61% /  
tmpfs           3.9G     0  3.9G   0% /dev/shm  
/dev/sdb         20G   44M   19G   1% /data01  

.2. Request a user with the replication role
In the Alibaba Cloud console, top right: 工单服务 -> 提交工单 -> 关系型数据库RDS -> submit a ticket, and ask support to create a user with the replication role.
PS: once an API for creating replication-role users is available, the ticket will no longer be needed.

.3. If you are replicating to a host outside Alibaba Cloud, or the RDS and ECS are in different availability zones, the connection has to go over the public network.
In that case, request a public address for the RDS instance; this is also done in the console, under RDS instance management.

.4. In the console, under RDS instance management -> data security, configure the whitelist and verify connectivity.
If the standby's outbound IP is fixed, whitelist that IP; if not, you have to whitelist 0.0.0.0.

.5. Download a full backup set
Download the most recent full backup set from the console.

.6. Verify that the archive download API works
Archived WAL files are downloaded through the API:
https://help.aliyun.com/document_detail/rds/OpenAPI-manual/RDS-OpenAPI-LogManagement/DescribeBinlogFiles.html?spm=5176.docrds/OpenAPI-manual/RDS-OpenAPI-BackupRecovery/DescribeBackups.6.217.tqV3VW
When is the API needed? When the XLOG the standby asks for has already been removed on the primary. That usually happens when the network between your standby and the primary fails, or the standby is shut down for a while and stops receiving XLOG from RDS PostgreSQL; once archived, those XLOG files are removed from the primary's WAL directory.
If your standby reports that a required XLOG file does not exist, download the archive from OSS.
If even OSS does not have it, the archive has been purged too, and you must rebuild the standby from step 5.

.7. Test streaming-replication connectivity to the database
Substitute your own RDS instance's connection details:

postgres@digoal-> psql "replication=true" -h xxxx.pg.rds.aliyuncs.com -p 3433 -U digoal  
Password for user digoal:   
psql (9.4.6, server 9.4.1)  
Type "help" for help.  
postgres=> IDENTIFY_SYSTEM;  
      systemid       | timeline |  xlogpos   | dbname   
---------------------+----------+------------+--------  
 6165616856935119759 |        3 | 0/6B3A0180 |   
(1 row)  

.8. Configure postgresql.conf and recovery.conf
Extract the full backup set into the planned directory:
/data01/pgdata

Configure postgresql.conf
Append the following at the end of the file:

# add by digoal  
port=1921  
unix_socket_directories='.'  
tcp_keepalives_idle = 70  
tcp_keepalives_interval = 10  
tcp_keepalives_count = 10  
log_destination='csvlog'  
logging_collector=on  
log_truncate_on_rotation=on  
log_line_prefix = ''  
log_checkpoints = on  
log_connections = on  
log_disconnections = on  
log_error_verbosity = verbose  
hot_standby = on  
max_standby_archive_delay = 300s  
max_standby_streaming_delay = 300s  
wal_receiver_status_interval = 1s  
hot_standby_feedback = on  
log_statement='none'  
archive_mode=on  
archive_command = '/bin/date'  
track_io_timing=off  
listen_addresses='0.0.0.0'  

Configure recovery.conf
Substitute your own RDS instance connection info:

standby_mode = 'on'  
primary_conninfo = 'host=xxxxxx.pg.rds.aliyuncs.com user=digoal password=xxxx port=3433'  
recovery_target_timeline = 'latest'  

.9. Start the standby and check that it is in sync

pg_ctl start  

On RDS:
postgres@digoal-> psql -h xxxx.pg.rds.aliyuncs.com -p 3433 -U digoal postgres
Type "help" for help.  
postgres=> create table test(id timestamp);  
postgres=> insert into test values (now());  
postgres=> update test set id=now() returning *;  
postgres=> \watch 1  

On the standby:
postgres@digoal-> psql -h 127.0.0.1 -p 1921 -U digoal postgres
Type "help" for help.  
postgres=> select * from test;  
postgres=> \watch 1  

Verify that the changes replicate.
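
On the standby, the standard 9.4 recovery functions give a quick health check; a minimal sketch (run it repeatedly, the replay location should keep advancing):

postgres=# select pg_is_in_recovery(),  
                  pg_last_xlog_receive_location(),  
                  pg_last_xlog_replay_location(),  
                  pg_last_xlog_replay_timestamp();  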

Risk assessment
.1. Avoid replication slots: a slot keeps the primary from recycling XLOG, so if the standby cannot receive XLOG in time (network congestion, standby failure, and so on), pg_xlog retained on the primary can fill its disk.
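
If you do use a slot despite this, monitor how much XLOG it pins on the primary. A hedged sketch using the standard 9.4 catalog and functions (RDS may not let you run it):

select slot_name, active,  
       pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) as retained_bytes  
from pg_replication_slots;  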

With this method, users can build their own standby and get off-site disaster recovery before RDS itself offers such a service.

PostgreSQL: second-level regex and fuzzy queries over tens of billions of rows


Regex matching and fuzzy matching are usually the strong suit of search engines, but PostgreSQL can do them too, and with respectable performance. Add a distributed scheme (e.g. plproxy, pg_shard, fdw shard, pg-xc, pg-xl, greenplum) and regex and fuzzy matching over tens of billions of rows works remarkably well, while keeping all the native capabilities of a database: several wins at once.

The Internet of Things produces huge volumes of data, not just numbers but also strings such as barcodes, license plates, phone numbers, email addresses, and names.
Suppose a user needs fuzzy search, or even regular-expression matching, over massive sensor data: is there an efficient way?
Such scenarios are common. For example, when a batch of suspect drugs appears on the market, their barcodes must be searched by regular expression to trace where the matching drugs went.
Or in criminal investigations, searching leads such as a partial phone number, email address, license plate, IP address, QQ number, or WeChat ID.
Layering time on top of that, fuzzy matching and correlation eventually identify the suspect.
As you can see, fuzzy matching and regex matching are a bit like assembling a facial composite; the need is very real.

First, classify the use cases and the optimizations available with current techniques.
.1. Prefix fuzzy queries, e.g. like 'ABC%', which in PG can also be written as ~ '^ABC'.
Can be optimized with a btree index, or by splitting the string into columns and combining multiple indexes with bit and / bit or (only suitable for short fixed-length strings, e.g. char(8)).

.2. Suffix fuzzy queries, e.g. like '%ABC', in PG also ~ 'ABC$'.
Can be optimized with a btree index on reverse(), or by splitting into columns as above (only suitable for short fixed-length strings, e.g. char(8)).

.3. Fuzzy queries with neither prefix nor suffix, e.g. like '%AB_C%', in PG also ~ 'AB.C'.
Can be optimized with a pg_trgm GIN index, or by splitting into columns as above (only suitable for short fixed-length strings, e.g. char(8)).

.4. Regular-expression queries, e.g. ~ '[\d]+def1.?[a|b|0|8]{1,3}'.
Can be optimized with a pg_trgm GIN index, or by splitting into columns as above (only suitable for short fixed-length strings, e.g. char(8)).

The pg_trgm extension has supported index-assisted fuzzy queries since PostgreSQL 9.1, and index-assisted regular-expression queries since 9.3, which greatly strengthens PostgreSQL for investigative work.
Code:
https://github.com/postgrespro/pg_trgm_pro

How pg_trgm works: two spaces are prepended and one space appended to the string, and the new string is split into tokens of every 3 adjacent characters.
When a regular-expression or fuzzy match runs, candidates are retrieved by similarity and then filtered.
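
You can inspect the tokenization directly with pg_trgm's show_trgm() function. For example, '376821ab' is padded to '  376821ab ' and split into nine trigrams:

select show_trgm('376821ab');  
-- yields "  3", " 37", "376", "768", "682", "821", "21a", "1ab", "ab "  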
GIN index diagram (image omitted).
When a matching token is found in the btree, it points to the corresponding posting list, and the ctids stored in that list lead to the actual records.
Because one string splits into many tokens, every inserted row updates many index entries, which is why GIN needs fastupdate.
How is regex matching done?
See https://raw.githubusercontent.com/postgrespro/pg_trgm_pro/master/trgm_regexp.c
The regular expression is converted into an NFA, and multiple tokens are then scanned with bit and|or matching.
If the regex expands into many bit and|or combinations, a large amount of rechecking is needed, and performance cannot be very good.

Below, each of the four scenarios above is optimized with a concrete example.

.1. Prefix fuzzy queries, e.g. like 'ABC%', in PG also ~ '^ABC'.
Optimized with a btree index, or by splitting into columns with bit and / bit or (short fixed-length strings only, e.g. char(8)).
Example: 10 million rows of the first 8 characters of random MD5 values.

postgres=# create table tb(info text);  
CREATE TABLE  
postgres=# insert into tb select substring(md5(random()::text),1,8) from generate_series(1,10000000);  
INSERT 0 10000000  
postgres=# create index idx_tb on tb(info);  
CREATE INDEX  
postgres=# select * from tb limit 1;  
   info     
----------  
 376821ab  
(1 row)  
postgres=# explain select * from tb where info ~ '^376821' limit 10;  
                                  QUERY PLAN                                     
-------------------------------------------------------------------------------  
 Limit  (cost=0.43..0.52 rows=10 width=9)  
   ->  Index Only Scan using idx_tb on tb  (cost=0.43..8.46 rows=1000 width=9)  
         Index Cond: ((info >= '376821'::text) AND (info < '376822'::text))  
         Filter: (info ~ '^376821'::text)  
(4 rows)  
postgres=# select * from tb where info ~ '^376821' limit 10;  
   info     
----------  
 376821ab  
(1 row)  
Time: 0.536 ms  
postgres=# set enable_indexscan=off;  
SET  
Time: 1.344 ms  
postgres=# set enable_bitmapscan=off;  
SET  
Time: 0.158 ms  
postgres=# explain select * from tb where info ~ '^376821' limit 10;  
                           QUERY PLAN                             
----------------------------------------------------------------  
 Limit  (cost=0.00..1790.55 rows=10 width=9)  
   ->  Seq Scan on tb  (cost=0.00..179055.00 rows=1000 width=9)  
         Filter: (info ~ '^376821'::text)  
(3 rows)  
Time: 0.505 ms  

Without an index, the prefix fuzzy query takes 5483 ms (shown below).
With the index, it takes only 0.5 ms.

postgres=# select * from tb where info ~ '^376821' limit 10;  
   info     
----------  
 376821ab  
(1 row)  
Time: 5483.655 ms  

.2. Suffix fuzzy queries, e.g. like '%ABC', in PG also ~ 'ABC$'.
Optimized with a btree index on reverse(), or by splitting into columns with bit and / bit or (short fixed-length strings only, e.g. char(8)).

postgres=# create index idx_tb1 on tb(reverse(info));  
CREATE INDEX  
postgres=# explain select * from tb where reverse(info) ~ '^ba128' limit 10;  
                                         QUERY PLAN                                           
--------------------------------------------------------------------------------------------  
 Limit  (cost=0.43..28.19 rows=10 width=9)  
   ->  Index Scan using idx_tb1 on tb  (cost=0.43..138778.43 rows=50000 width=9)  
         Index Cond: ((reverse(info) >= 'ba128'::text) AND (reverse(info) < 'ba129'::text))  
         Filter: (reverse(info) ~ '^ba128'::text)  
(4 rows)  

postgres=# select * from tb where reverse(info) ~ '^ba128' limit 10;  
   info     
----------  
 220821ab  
 671821ab  
 305821ab  
 e65821ab  
 536821ab  
 376821ab  
 668821ab  
 4d8821ab  
 26c821ab  
(9 rows)  
Time: 0.506 ms  

With the reverse() index, the suffix fuzzy query takes only 0.5 ms.
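
The same suffix search can also be written with LIKE. Note that for LIKE 'prefix%' to use a btree index the database must be in C collation; otherwise build the expression index with text_pattern_ops. A hedged sketch (idx_tb1_pattern is a hypothetical index name):

create index idx_tb1_pattern on tb (reverse(info) text_pattern_ops);  
select * from tb where reverse(info) like 'ba128%' limit 10;  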

.3. Fuzzy queries with neither prefix nor suffix, e.g. like '%AB_C%', in PG also ~ 'AB.C'.
Optimized with a pg_trgm GIN index, or by splitting into columns with bit and / bit or (short fixed-length strings only, e.g. char(8)).

postgres=# create extension pg_trgm;  
CREATE EXTENSION  
postgres=# create index idx_tb_2 on tb using gin (info gin_trgm_ops);  -- this statement was missing; inferred from the plan below, which scans idx_tb_2  
CREATE INDEX  
postgres=# explain select * from tb where info ~ '5821a';  
                                 QUERY PLAN                                   
----------------------------------------------------------------------------  
 Bitmap Heap Scan on tb  (cost=103.75..3677.71 rows=1000 width=9)  
   Recheck Cond: (info ~ '5821a'::text)  
   ->  Bitmap Index Scan on idx_tb_2  (cost=0.00..103.50 rows=1000 width=0)  
         Index Cond: (info ~ '5821a'::text)  
(4 rows)  
Time: 0.647 ms  

postgres=# select * from tb where info ~ '5821a';  
   info     
----------  
 5821a8a3  
 945821af  
 45821a74  
 9fe5821a  
 5821a7e0  
 5821af2a  
 1075821a  
 e5821ac9  
 d265821a  
 45f5821a  
 df5821a4  
 de5821af  
 71c5821a  
 375821a3  
 fc5821af  
 5c5821ad  
 e65821ab  
 5821adde  
 c35821a6  
 5821a642  
 305821ab  
 5821a1c8  
 75821a5c  
 ce95821a  
 a65821ad  
(25 rows)  
Time: 3.808 ms  

With the GIN index, the infix fuzzy query takes only 3.8 ms.

.4. Regular-expression queries, e.g. ~ '[\d]+def1.?[a|b|0|8]{1,3}'.
Optimized with a pg_trgm GIN index, or by splitting into columns with bit and / bit or (short fixed-length strings only, e.g. char(8)).

With the GIN index, this regex query takes only 108 ms.

postgres=# select * from tb where info ~ 'e65[\d]{2}a[b]{1,2}8' limit 10;  
   info     
----------  
 4e6567ab  
 1e6530ab  
 e6500ab8  
 ae6583ab  
 e6564ab7  
 5e6532ab  
 e6526abf  
 e6560ab6  
(8 rows)  
Time: 108.577 ms  

The time is mostly spent on the recheck:
the index scan returned 14794 rows and 14793 of them were removed. Most of the work is wasted, yet it still beats a full table scan by a wide margin.

postgres=# explain (verbose,analyze,buffers,costs,timing) select * from tb where info ~ 'e65[\d]{2}a[b]{1,2}8' limit 10;  
                                                            QUERY PLAN                                                              
----------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=511.75..547.49 rows=10 width=9) (actual time=89.934..120.567 rows=1 loops=1)  
   Output: info  
   Buffers: shared hit=13054  
   ->  Bitmap Heap Scan on public.tb  (cost=511.75..4085.71 rows=1000 width=9) (actual time=89.930..120.562 rows=1 loops=1)  
         Output: info  
         Recheck Cond: (tb.info ~ 'e65[\d]{2}a[b]{1,2}8'::text)  
         Rows Removed by Index Recheck: 14793  
         Heap Blocks: exact=12929  
         Buffers: shared hit=13054  
         ->  Bitmap Index Scan on idx_tb_2  (cost=0.00..511.50 rows=1000 width=0) (actual time=67.589..67.589 rows=14794 loops=1)  
               Index Cond: (tb.info ~ 'e65[\d]{2}a[b]{1,2}8'::text)  
               Buffers: shared hit=125  
 Planning time: 0.493 ms  
 Execution time: 120.618 ms  
(14 rows)  
Time: 124.693 ms  

Optimization notes:
With a GIN index, insert performance needs attention, because the info field is shattered into many 3-character tokens, so each row touches a large number of index entries. Under very high insert concurrency, raise gin_pending_list_limit to improve insert throughput and avoid the response-time spike of merging into the index in real time.
With fastupdate enabled, every VACUUM of the table automatically merges the pending entries into the GIN index.
One more point: queries never trigger the merge; pending entries that have not been merged yet are searched by sequential traversal. See the sketch below.
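
A minimal sketch of watching and merging the pending list by hand, assuming the tbl table and trgm_idx index defined in the stress test below; pgstatginindex() comes from the pgstattuple extension, if your version ships it:

vacuum tbl;                                -- a vacuum merges pending entries into the GIN index  
select * from pgstatginindex('trgm_idx');  -- pending_pages / pending_tuples show the backlog  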

Stress test of high-concurrency inserts:

create table tbl(id serial8, crt_time timestamp, sensorid int, sensorloc point, info text) with (autovacuum_enabled=on, autovacuum_vacuum_threshold=0.000001,autovacuum_vacuum_cost_delay=0);  
CREATE INDEX trgm_idx ON tbl USING GIN (info gin_trgm_ops) with (fastupdate='on', gin_pending_list_limit='6553600');  
alter sequence tbl_id_seq cache 10000;  

Tune the configuration so autovacuum iterates quickly and keeps merging the GIN pending list:

vi $PGDATA/postgresql.conf  
autovacuum_naptime=1s  
maintenance_work_mem=1GB  
autovacuum_work_mem=1GB  
autovacuum = on  
autovacuum_max_workers = 3  
log_autovacuum_min_duration = 0  
autovacuum_vacuum_cost_delay=0  

$ pg_ctl reload  

Create a test function that generates random test data:

postgres=# create or replace function f() returns void as $$  
  insert into tbl (crt_time,sensorid,info) values ( clock_timestamp(),trunc(random()*500000),substring(md5(random()::text),1,8) );  
$$ language sql strict;  

vi test.sql  
select f();  

pgbench -M prepared -n -r -P 1 -f ./test.sql -c 48 -j 48 -T 10000  

progress: 50.0 s, 52800.9 tps, lat 0.453 ms stddev 0.390  
progress: 51.0 s, 52775.8 tps, lat 0.453 ms stddev 0.398  
progress: 52.0 s, 53173.2 tps, lat 0.449 ms stddev 0.371  
progress: 53.0 s, 53010.0 tps, lat 0.451 ms stddev 0.390  
progress: 54.0 s, 53360.9 tps, lat 0.448 ms stddev 0.365  
progress: 55.0 s, 53285.0 tps, lat 0.449 ms stddev 0.362  
progress: 56.0 s, 53662.1 tps, lat 0.445 ms stddev 0.368  
progress: 57.0 s, 53283.8 tps, lat 0.448 ms stddev 0.385  
progress: 58.0 s, 53703.4 tps, lat 0.445 ms stddev 0.355  
progress: 59.0 s, 53818.7 tps, lat 0.444 ms stddev 0.344  
progress: 60.0 s, 53889.2 tps, lat 0.443 ms stddev 0.361  
progress: 61.0 s, 53613.8 tps, lat 0.446 ms stddev 0.355  
progress: 62.0 s, 53339.9 tps, lat 0.448 ms stddev 0.392  
progress: 63.0 s, 54014.9 tps, lat 0.442 ms stddev 0.346  
progress: 64.0 s, 53112.1 tps, lat 0.450 ms stddev 0.374  
progress: 65.0 s, 53706.1 tps, lat 0.445 ms stddev 0.367  
progress: 66.0 s, 53720.9 tps, lat 0.445 ms stddev 0.353  
progress: 67.0 s, 52858.1 tps, lat 0.452 ms stddev 0.415  
progress: 68.0 s, 53218.9 tps, lat 0.449 ms stddev 0.387  
progress: 69.0 s, 53403.0 tps, lat 0.447 ms stddev 0.377  
progress: 70.0 s, 53179.9 tps, lat 0.449 ms stddev 0.377  
progress: 71.0 s, 53232.4 tps, lat 0.449 ms stddev 0.373  
progress: 72.0 s, 53011.7 tps, lat 0.451 ms stddev 0.386  
progress: 73.0 s, 52685.1 tps, lat 0.454 ms stddev 0.384  
progress: 74.0 s, 52937.8 tps, lat 0.452 ms stddev 0.377  

At this rate (roughly 53,000 inserts/s × 86,400 s ≈ 4.6 billion), more than 4 billion rows can be loaded per day.

Next, compare with splitting the string into separate columns. This approach only applies when the string length is fixed and small; it is useless for variable-length strings.
The split-column results below are underwhelming, so pg_trgm remains the more dependable choice.

postgres=# create table t_split(id int, crt_time timestamp, sensorid int, sensorloc point, info text, c1 char(1), c2 char(1), c3 char(1), c4 char(1), c5 char(1), c6 char(1), c7 char(1), c8 char(1));  
CREATE TABLE  
Time: 2.123 ms  

postgres=# insert into t_split(id,crt_time,sensorid,info,c1,c2,c3,c4,c5,c6,c7,c8) select id,ct,sen,info,substring(info,1,1),substring(info,2,1),substring(info,3,1),substring(info,4,1),substring(info,5,1),substring(info,6,1),substring(info,7,1),substring(info,8,1) from (select id, clock_timestamp() ct, trunc(random()*500000) sen, substring(md5(random()::text), 1, 8) info from generate_series(1,10000000) t(id)) t;  
INSERT 0 10000000  
Time: 81829.274 ms  

postgres=# create index idx1 on t_split (c1);  
postgres=# create index idx2 on t_split (c2);  
postgres=# create index idx3 on t_split (c3);  
postgres=# create index idx4 on t_split (c4);  
postgres=# create index idx5 on t_split (c5);  
postgres=# create index idx6 on t_split (c6);  
postgres=# create index idx7 on t_split (c7);  
postgres=# create index idx8 on t_split (c8);  
postgres=# create index idx9 on t_split using gin (info gin_trgm_ops);  

postgres=# select * from t_split limit 1;  
 id |          crt_time          | sensorid | sensorloc |   info   | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8   
----+----------------------------+----------+-----------+----------+----+----+----+----+----+----+----+----  
  1 | 2016-03-02 09:58:03.990639 |   161958 |           | 33eed779 | 3  | 3  | e  | e  | d  | 7  | 7  | 9  
(1 row)  

postgres=# select * from t_split where info ~ '^3[\d]?eed[\d]?79$' limit 10;  
 id |          crt_time          | sensorid | sensorloc |   info   | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8   
----+----------------------------+----------+-----------+----------+----+----+----+----+----+----+----+----  
  1 | 2016-03-02 09:58:03.990639 |   161958 |           | 33eed779 | 3  | 3  | e  | e  | d  | 7  | 7  | 9  
(1 row)  
Time: 133.041 ms  
postgres=# explain (analyze,verbose,timing,costs,buffers) select * from t_split where info ~ '^3[\d]?eed[\d]?79$' limit 10;  
                                                            QUERY PLAN                                                              
----------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=575.75..612.78 rows=10 width=57) (actual time=92.406..129.838 rows=1 loops=1)  
   Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
   Buffers: shared hit=13798  
   ->  Bitmap Heap Scan on public.t_split  (cost=575.75..4278.56 rows=1000 width=57) (actual time=92.403..129.833 rows=1 loops=1)  
         Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
         Recheck Cond: (t_split.info ~ '^3[\d]?eed[\d]?79$'::text)  
         Rows Removed by Index Recheck: 14690  
         Heap Blocks: exact=13669  
         Buffers: shared hit=13798  
         ->  Bitmap Index Scan on idx9  (cost=0.00..575.50 rows=1000 width=0) (actual time=89.576..89.576 rows=14691 loops=1)  
               Index Cond: (t_split.info ~ '^3[\d]?eed[\d]?79$'::text)  
               Buffers: shared hit=129  
 Planning time: 0.385 ms  
 Execution time: 129.883 ms  
(14 rows)  

Time: 130.678 ms  


postgres=# select * from t_split where c1='3' and c3='e' and c4='e' and c5='d' and c7='7' and c8='9' and c2 between '0' and '9' and c6 between '0' and '9' limit 10;  
 id |          crt_time          | sensorid | sensorloc |   info   | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8   
----+----------------------------+----------+-----------+----------+----+----+----+----+----+----+----+----  
  1 | 2016-03-02 09:58:03.990639 |   161958 |           | 33eed779 | 3  | 3  | e  | e  | d  | 7  | 7  | 9  
(1 row)  

Time: 337.367 ms  

postgres=# explain (analyze,verbose,timing,costs,buffers) select * from t_split where c1='3' and c3='e' and c4='e' and c5='d' and c7='7' and c8='9' and c2 between '0' and '9' and c6 between '0' and '9' limit 10;  
                                                                                                                 QUERY PLAN                                                                                                                   
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=33582.31..41499.35 rows=1 width=57) (actual time=339.230..344.675 rows=1 loops=1)  
   Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
   Buffers: shared hit=7581  
   ->  Bitmap Heap Scan on public.t_split  (cost=33582.31..41499.35 rows=1 width=57) (actual time=339.228..344.673 rows=1 loops=1)  
         Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
         Recheck Cond: ((t_split.c3 = 'e'::bpchar) AND (t_split.c8 = '9'::bpchar) AND (t_split.c5 = 'd'::bpchar))  
         Filter: ((t_split.c2 >= '0'::bpchar) AND (t_split.c2 <= '9'::bpchar) AND (t_split.c6 >= '0'::bpchar) AND (t_split.c6 <= '9'::bpchar) AND (t_split.c1 = '3'::bpchar) AND (t_split.c4 = 'e'::bpchar) AND (t_split.c7 = '7'::bpchar))  
         Rows Removed by Filter: 2480  
         Heap Blocks: exact=2450  
         Buffers: shared hit=7581  
         ->  BitmapAnd  (cost=33582.31..33582.31 rows=2224 width=0) (actual time=338.512..338.512 rows=0 loops=1)  
               Buffers: shared hit=5131  
               ->  Bitmap Index Scan on idx3  (cost=0.00..11016.93 rows=596333 width=0) (actual time=104.418..104.418 rows=624930 loops=1)  
                     Index Cond: (t_split.c3 = 'e'::bpchar)  
                     Buffers: shared hit=1711  
               ->  Bitmap Index Scan on idx8  (cost=0.00..11245.44 rows=608667 width=0) (actual time=100.185..100.185 rows=625739 loops=1)  
                     Index Cond: (t_split.c8 = '9'::bpchar)  
                     Buffers: shared hit=1712  
               ->  Bitmap Index Scan on idx5  (cost=0.00..11319.44 rows=612667 width=0) (actual time=99.480..99.480 rows=624269 loops=1)  
                     Index Cond: (t_split.c5 = 'd'::bpchar)  
                     Buffers: shared hit=1708  
 Planning time: 0.262 ms  
 Execution time: 344.731 ms  
(23 rows)  

Time: 346.424 ms  

postgres=# select * from t_split where info ~ '^33.+7.+9$' limit 10;  
   id   |          crt_time          | sensorid | sensorloc |   info   | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8   
--------+----------------------------+----------+-----------+----------+----+----+----+----+----+----+----+----  
      1 | 2016-03-02 09:58:03.990639 |   161958 |           | 33eed779 | 3  | 3  | e  | e  | d  | 7  | 7  | 9  
  24412 | 2016-03-02 09:58:04.186359 |   251599 |           | 33f07429 | 3  | 3  | f  | 0  | 7  | 4  | 2  | 9  
  24989 | 2016-03-02 09:58:04.191112 |   214569 |           | 334587d9 | 3  | 3  | 4  | 5  | 8  | 7  | d  | 9  
  50100 | 2016-03-02 09:58:04.398499 |   409819 |           | 33beb7b9 | 3  | 3  | b  | e  | b  | 7  | b  | 9  
  92623 | 2016-03-02 09:58:04.745372 |   280100 |           | 3373e719 | 3  | 3  | 7  | 3  | e  | 7  | 1  | 9  
 106054 | 2016-03-02 09:58:04.855627 |   155192 |           | 33c575c9 | 3  | 3  | c  | 5  | 7  | 5  | c  | 9  
 107070 | 2016-03-02 09:58:04.863827 |   464325 |           | 337dd729 | 3  | 3  | 7  | d  | d  | 7  | 2  | 9  
 135152 | 2016-03-02 09:58:05.088217 |   240500 |           | 336271d9 | 3  | 3  | 6  | 2  | 7  | 1  | d  | 9  
 156425 | 2016-03-02 09:58:05.25805  |   218202 |           | 333e7289 | 3  | 3  | 3  | e  | 7  | 2  | 8  | 9  
 170210 | 2016-03-02 09:58:05.368371 |   132530 |           | 33a8d789 | 3  | 3  | a  | 8  | d  | 7  | 8  | 9  
(10 rows)  

Time: 20.431 ms  

postgres=# explain (analyze,verbose,timing,costs,buffers) select * from t_split where info ~ '^33.+7.+9$' limit 10;  
                                                           QUERY PLAN                                                              
---------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=43.75..80.78 rows=10 width=57) (actual time=19.573..21.212 rows=10 loops=1)  
   Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
   Buffers: shared hit=566  
   ->  Bitmap Heap Scan on public.t_split  (cost=43.75..3746.56 rows=1000 width=57) (actual time=19.571..21.206 rows=10 loops=1)  
         Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
         Recheck Cond: (t_split.info ~ '^33.+7.+9$'::text)  
         Rows Removed by Index Recheck: 647  
         Heap Blocks: exact=552  
         Buffers: shared hit=566  
         ->  Bitmap Index Scan on idx9  (cost=0.00..43.50 rows=1000 width=0) (actual time=11.712..11.712 rows=39436 loops=1)  
               Index Cond: (t_split.info ~ '^33.+7.+9$'::text)  
               Buffers: shared hit=14  
 Planning time: 0.301 ms  
 Execution time: 21.255 ms  
(14 rows)  

Time: 21.995 ms  


postgres=# select * from t_split where c1='3' and c2='3' and c8='9' and (c4='7' or c5='7' or c6='7') limit 10;  
   id   |          crt_time          | sensorid | sensorloc |   info   | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8   
--------+----------------------------+----------+-----------+----------+----+----+----+----+----+----+----+----  
      1 | 2016-03-02 09:58:03.990639 |   161958 |           | 33eed779 | 3  | 3  | e  | e  | d  | 7  | 7  | 9  
  24412 | 2016-03-02 09:58:04.186359 |   251599 |           | 33f07429 | 3  | 3  | f  | 0  | 7  | 4  | 2  | 9  
  24989 | 2016-03-02 09:58:04.191112 |   214569 |           | 334587d9 | 3  | 3  | 4  | 5  | 8  | 7  | d  | 9  
  50100 | 2016-03-02 09:58:04.398499 |   409819 |           | 33beb7b9 | 3  | 3  | b  | e  | b  | 7  | b  | 9  
  92623 | 2016-03-02 09:58:04.745372 |   280100 |           | 3373e719 | 3  | 3  | 7  | 3  | e  | 7  | 1  | 9  
 106054 | 2016-03-02 09:58:04.855627 |   155192 |           | 33c575c9 | 3  | 3  | c  | 5  | 7  | 5  | c  | 9  
 107070 | 2016-03-02 09:58:04.863827 |   464325 |           | 337dd729 | 3  | 3  | 7  | d  | d  | 7  | 2  | 9  
 135152 | 2016-03-02 09:58:05.088217 |   240500 |           | 336271d9 | 3  | 3  | 6  | 2  | 7  | 1  | d  | 9  
 156425 | 2016-03-02 09:58:05.25805  |   218202 |           | 333e7289 | 3  | 3  | 3  | e  | 7  | 2  | 8  | 9  
 170210 | 2016-03-02 09:58:05.368371 |   132530 |           | 33a8d789 | 3  | 3  | a  | 8  | d  | 7  | 8  | 9  
(10 rows)  

Time: 37.739 ms  

postgres=# explain (analyze,verbose,timing,costs,buffers) select * from t_split where c1='3' and c2='3' and c8='9' and (c4='7' or c5='7' or c6='7') limit 10;  
                                                                                               QUERY PLAN                                                                                                  
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=0.00..8135.78 rows=10 width=57) (actual time=0.017..35.532 rows=10 loops=1)  
   Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
   Buffers: shared hit=1755  
   ->  Seq Scan on public.t_split  (cost=0.00..353093.00 rows=434 width=57) (actual time=0.015..35.526 rows=10 loops=1)  
         Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8  
         Filter: ((t_split.c1 = '3'::bpchar) AND (t_split.c2 = '3'::bpchar) AND (t_split.c8 = '9'::bpchar) AND ((t_split.c4 = '7'::bpchar) OR (t_split.c5 = '7'::bpchar) OR (t_split.c6 = '7'::bpchar)))  
         Rows Removed by Filter: 170200  
         Buffers: shared hit=1755  
 Planning time: 0.210 ms  
 Execution time: 35.572 ms  
(10 rows)  

Time: 36.260 ms  

postgres=# select * from t_split where info ~ '^3.?[b-g]+ed[\d]+79' order by info <-> '^3.?[b-g]+ed[\d]+79' limit 10;  
   id    |          crt_time          | sensorid | sensorloc |   info   | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8   
---------+----------------------------+----------+-----------+----------+----+----+----+----+----+----+----+----  
       1 | 2016-03-02 09:58:03.990639 |   161958 |           | 33eed779 | 3  | 3  | e  | e  | d  | 7  | 7  | 9  
 1308724 | 2016-03-02 09:58:14.590901 |   458822 |           | 3fed9479 | 3  | f  | e  | d  | 9  | 4  | 7  | 9  
 2866024 | 2016-03-02 09:58:27.20105  |   106467 |           | 3fed2279 | 3  | f  | e  | d  | 2  | 2  | 7  | 9  
 4826729 | 2016-03-02 09:58:42.907431 |   228023 |           | 3ded9879 | 3  | d  | e  | d  | 9  | 8  | 7  | 9  
 6113373 | 2016-03-02 09:58:53.211146 |   499702 |           | 36fed479 | 3  | 6  | f  | e  | d  | 4  | 7  | 9  
 1768237 | 2016-03-02 09:58:18.310069 |   345027 |           | 30fed079 | 3  | 0  | f  | e  | d  | 0  | 7  | 9  
 1472324 | 2016-03-02 09:58:15.913629 |   413283 |           | 3eed5798 | 3  | e  | e  | d  | 5  | 7  | 9  | 8  
 8319056 | 2016-03-02 09:59:10.902137 |   336740 |           | 3ded7790 | 3  | d  | e  | d  | 7  | 7  | 9  | 0  
 8576573 | 2016-03-02 09:59:12.962923 |   130223 |           | 3eed5793 | 3  | e  | e  | d  | 5  | 7  | 9  | 3  
(9 rows)  

Time: 268.661 ms  

postgres=# explain (analyze,verbose,timing,buffers,costs) select * from t_split where info ~ '^3.?[b-g]+ed[\d]+79' order by info <-> '^3.?[b-g]+ed[\d]+79' limit 10;  
                                                               QUERY PLAN                                                                  
-----------------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=4302.66..4302.69 rows=10 width=57) (actual time=269.214..269.217 rows=9 loops=1)  
   Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8, ((info <-> '^3.?[b-g]+ed[\d]+79'::text))  
   Buffers: shared hit=52606  
   ->  Sort  (cost=4302.66..4305.16 rows=1000 width=57) (actual time=269.212..269.212 rows=9 loops=1)  
         Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8, ((info <-> '^3.?[b-g]+ed[\d]+79'::text))  
         Sort Key: ((t_split.info <-> '^3.?[b-g]+ed[\d]+79'::text))  
         Sort Method: quicksort  Memory: 26kB  
         Buffers: shared hit=52606  
         ->  Bitmap Heap Scan on public.t_split  (cost=575.75..4281.06 rows=1000 width=57) (actual time=100.771..269.180 rows=9 loops=1)  
               Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8, (info <-> '^3.?[b-g]+ed[\d]+79'::text)  
               Recheck Cond: (t_split.info ~ '^3.?[b-g]+ed[\d]+79'::text)  
               Rows Removed by Index Recheck: 72929  
               Heap Blocks: exact=52479  
               Buffers: shared hit=52606  
               ->  Bitmap Index Scan on idx9  (cost=0.00..575.50 rows=1000 width=0) (actual time=88.062..88.062 rows=72938 loops=1)  
                     Index Cond: (t_split.info ~ '^3.?[b-g]+ed[\d]+79'::text)  
                     Buffers: shared hit=127  
 Planning time: 0.640 ms  
 Execution time: 269.287 ms  
(19 rows)  

Time: 270.430 ms  

postgres=# select * from t_split where info ~ '3.?[b-g]+ed[\d]+79' order by info <-> '3.?[b-g]+ed[\d]+79' limit 10;  
   id    |          crt_time          | sensorid | sensorloc |   info   | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8   
---------+----------------------------+----------+-----------+----------+----+----+----+----+----+----+----+----  
       1 | 2016-03-02 09:58:03.990639 |   161958 |           | 33eed779 | 3  | 3  | e  | e  | d  | 7  | 7  | 9  
 1308724 | 2016-03-02 09:58:14.590901 |   458822 |           | 3fed9479 | 3  | f  | e  | d  | 9  | 4  | 7  | 9  
 4826729 | 2016-03-02 09:58:42.907431 |   228023 |           | 3ded9879 | 3  | d  | e  | d  | 9  | 8  | 7  | 9  
 5250603 | 2016-03-02 09:58:46.300289 |   250582 |           | d3eed179 | d  | 3  | e  | e  | d  | 1  | 7  | 9  
 6113373 | 2016-03-02 09:58:53.211146 |   499702 |           | 36fed479 | 3  | 6  | f  | e  | d  | 4  | 7  | 9  
 1768237 | 2016-03-02 09:58:18.310069 |   345027 |           | 30fed079 | 3  | 0  | f  | e  | d  | 0  | 7  | 9  
 2866024 | 2016-03-02 09:58:27.20105  |   106467 |           | 3fed2279 | 3  | f  | e  | d  | 2  | 2  | 7  | 9  
 1472324 | 2016-03-02 09:58:15.913629 |   413283 |           | 3eed5798 | 3  | e  | e  | d  | 5  | 7  | 9  | 8  
 8576573 | 2016-03-02 09:59:12.962923 |   130223 |           | 3eed5793 | 3  | e  | e  | d  | 5  | 7  | 9  | 3  
 8319056 | 2016-03-02 09:59:10.902137 |   336740 |           | 3ded7790 | 3  | d  | e  | d  | 7  | 7  | 9  | 0  
(10 rows)  

Time: 320.525 ms  

postgres=# explain (analyze,verbose,buffers,costs,timing) select * from t_split where info ~ '3.?[b-g]+ed[\d]+79' order by info <-> '3.?[b-g]+ed[\d]+79' limit 10;  
                                                                QUERY PLAN                                                                  
------------------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=4302.66..4302.69 rows=10 width=57) (actual time=319.925..319.927 rows=10 loops=1)  
   Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8, ((info <-> '3.?[b-g]+ed[\d]+79'::text))  
   Buffers: shared hit=52602  
   ->  Sort  (cost=4302.66..4305.16 rows=1000 width=57) (actual time=319.923..319.923 rows=10 loops=1)  
         Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8, ((info <-> '3.?[b-g]+ed[\d]+79'::text))  
         Sort Key: ((t_split.info <-> '3.?[b-g]+ed[\d]+79'::text))  
         Sort Method: quicksort  Memory: 26kB  
         Buffers: shared hit=52602  
         ->  Bitmap Heap Scan on public.t_split  (cost=575.75..4281.06 rows=1000 width=57) (actual time=104.526..319.885 rows=10 loops=1)  
               Output: id, crt_time, sensorid, sensorloc, info, c1, c2, c3, c4, c5, c6, c7, c8, (info <-> '3.?[b-g]+ed[\d]+79'::text)  
               Recheck Cond: (t_split.info ~ '3.?[b-g]+ed[\d]+79'::text)  
               Rows Removed by Index Recheck: 72928  
               Heap Blocks: exact=52479  
               Buffers: shared hit=52602  
               ->  Bitmap Index Scan on idx9  (cost=0.00..575.50 rows=1000 width=0) (actual time=91.945..91.945 rows=72938 loops=1)  
                     Index Cond: (t_split.info ~ '3.?[b-g]+ed[\d]+79'::text)  
                     Buffers: shared hit=123  
 Planning time: 0.948 ms  
 Execution time: 320.003 ms  
(19 rows)  

Time: 321.502 ms  

Large-volume performance test:
Simulate a partitioned table, one partition per hour, 50 million rows per hour: 1.2 billion per day, 36 billion per month.

drop table tbl cascade;  
create table tbl (id serial8, crt_time timestamp, sensorid int, sensorloc point, info text);  

do language plpgsql $$  
declare  
  v_s timestamp := '2016-01-01 00:00:00';  
begin  
  for i in 1..720 loop  
    execute 'create table tbl_'||to_char(v_s,'yyyymmddhh24')||' (id int8 not null default nextval(''tbl_id_seq''::regclass), crt_time timestamp check (crt_time >= '''||to_char(v_s,'yyyy-mm-dd hh24:mi:ss')||''' and crt_time <'''||to_char(v_s+'1 h'::interval,'yyyy-mm-dd hh24:mi:ss')||'''), sensorid int, sensorloc point, info text) inherits (tbl)';  
    v_s := v_s + '1 h'::interval;  
  end loop;  
end;  
$$;  

alter sequence tbl_id_seq cache 100000;  

Generate the INSERT SQL

do language plpgsql $$  
declare  
  v_s timestamp := '2016-01-01 00:00:00';  
begin  
  for i in 1..720 loop  
    raise notice '%', 'psql -c "insert into tbl_'||to_char(v_s,'yyyymmddhh24')||' (crt_time, sensorid, info) select '''||to_char(v_s,'yyyy-mm-dd hh24:mi:ss')||''',trunc(random()*500000), substring(md5(random()::text),1,8) from generate_series(1,50000000);" &';  
    v_s := v_s + '1 h'::interval;  
  end loop;  
end;  
$$;  

Performance: a range scan that lands inside a single 50-million-row partition returns at millisecond level.

postgres=# explain (analyze,verbose,timing,buffers,costs) select * from tbl where crt_time between '2016-01-01 12:00:00' and '2016-01-01 12:30:00' and info ~ 'f[\d]{2}e27e0$' order by info <-> 'f[\d]{2}e27e0$' limit 10;  
                                                                                                        QUERY PLAN                                                                                                           
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  
 Limit  (cost=18918.83..18918.85 rows=10 width=45) (actual time=350.296..350.297 rows=2 loops=1)  
   Output: tbl.id, tbl.crt_time, tbl.sensorid, tbl.sensorloc, tbl.info, ((tbl.info <-> 'f[\d]{2}e27e0$'::text))  
   Buffers: shared hit=4530  
   ->  Sort  (cost=18918.83..18931.33 rows=5001 width=45) (actual time=350.294..350.295 rows=2 loops=1)  
         Output: tbl.id, tbl.crt_time, tbl.sensorid, tbl.sensorloc, tbl.info, ((tbl.info <-> 'f[\d]{2}e27e0$'::text))  
         Sort Key: ((tbl.info <-> 'f[\d]{2}e27e0$'::text))  
         Sort Method: quicksort  Memory: 25kB  
         Buffers: shared hit=4530  
         ->  Result  (cost=0.00..18810.76 rows=5001 width=45) (actual time=347.995..350.279 rows=2 loops=1)  
               Output: tbl.id, tbl.crt_time, tbl.sensorid, tbl.sensorloc, tbl.info, (tbl.info <-> 'f[\d]{2}e27e0$'::text)  
               Buffers: shared hit=4530  
               ->  Append  (cost=0.00..18798.26 rows=5001 width=45) (actual time=347.976..350.254 rows=2 loops=1)  
                     Buffers: shared hit=4530  
                     ->  Seq Scan on public.tbl  (cost=0.00..0.00 rows=1 width=68) (actual time=0.001..0.001 rows=0 loops=1)  
                           Output: tbl.id, tbl.crt_time, tbl.sensorid, tbl.sensorloc, tbl.info  
                           Filter: ((tbl.crt_time >= '2016-01-01 12:00:00'::timestamp without time zone) AND (tbl.crt_time <= '2016-01-01 12:30:00'::timestamp without time zone) AND (tbl.info ~ 'f[\d]{2}e27e0$'::text))  
                     ->  Bitmap Heap Scan on public.tbl_2016010112  (cost=574.75..18798.26 rows=5000 width=45) (actual time=347.972..350.249 rows=2 loops=1)  
                           Output: tbl_2016010112.id, tbl_2016010112.crt_time, tbl_2016010112.sensorid, tbl_2016010112.sensorloc, tbl_2016010112.info  
                           Recheck Cond: (tbl_2016010112.info ~ 'f[\d]{2}e27e0$'::text)  
                           Rows Removed by Index Recheck: 4100  
                           Filter: ((tbl_2016010112.crt_time >= '2016-01-01 12:00:00'::timestamp without time zone) AND (tbl_2016010112.crt_time <= '2016-01-01 12:30:00'::timestamp without time zone))  
                           Heap Blocks: exact=4085  
                           Buffers: shared hit=4530  
                           ->  Bitmap Index Scan on idx_tbl_2016010112  (cost=0.00..573.50 rows=5000 width=0) (actual time=337.125..337.125 rows=4102 loops=1)  
                                 Index Cond: (tbl_2016010112.info ~ 'f[\d]{2}e27e0$'::text)  
                                 Buffers: shared hit=445  
 Planning time: 23.913 ms  
 Execution time: 350.383 ms  
(28 rows)  

postgres=# select * from tbl where crt_time between '2016-01-01 12:00:00' and '2016-01-01 12:30:00' and info ~ 'f[\d]{2}e27e0$' order by info <-> 'f[\d]{2}e27e0$' limit 10;  
     id     |      crt_time       | sensorid | sensorloc |   info     
------------+---------------------+----------+-----------+----------  
 1982100172 | 2016-01-01 12:00:00 |   336772 |           | f48e27e0  
 2292713691 | 2016-01-01 12:00:00 |   489110 |           | f77e27e0  
(2 rows)  

Regex and fuzzy query performance test on a single table of 14.4 billion rows:

postgres=# \dt+ t_all  
                    List of relations  
 Schema | Name  | Type  |  Owner   |  Size  | Description   
--------+-------+-------+----------+--------+-------------  
 public | t_all | table | postgres | 811 GB |   
(1 row)  

postgres=# \d t_all  
                Table "public.t_all"  
  Column   |            Type             | Modifiers   
-----------+-----------------------------+-----------  
 id        | bigint                      | not null  
 crt_time  | timestamp without time zone |   
 sensorid  | integer                     |   
 sensorloc | point                       |   
 info      | text                        |   

postgres=# select count(*) from t_all;  
    count     
-------------  
 14456717312  
(1 row)  

postgres=# select * from t_all limit 10;  
     id     |      crt_time       | sensorid | sensorloc |   info     
------------+---------------------+----------+-----------+----------  
 6519272065 | 2016-01-06 10:00:00 |   493013 |           | 62255c83  
 6519272066 | 2016-01-06 10:00:00 |   309676 |           | f6c98800  
 6519272067 | 2016-01-06 10:00:00 |    43859 |           | 125a1191  
 6519272068 | 2016-01-06 10:00:00 |   495624 |           | e75cfd71  
 6519272069 | 2016-01-06 10:00:00 |    10362 |           | 7171f11c  
 6519272070 | 2016-01-06 10:00:00 |   231476 |           | 4201f809  
 6519272071 | 2016-01-06 10:00:00 |    43080 |           | a47e84e5  
 6519272072 | 2016-01-06 10:00:00 |   131292 |           | 17bc248e  
 6519272073 | 2016-01-06 10:00:00 |   486841 |           | 3303097c  
 6519272074 | 2016-01-06 10:00:00 |   491503 |           | f0c53fee  
(10 rows)  

The test data will be published later; with the table partitioned, second-level response is not a problem. Where does that confidence come from?
The bottleneck is not IO but the recheck of candidate rows: split the 14.4 billion rows into 29 tables of roughly 500 million each, run them in parallel, and sub-second results become feasible. See the sketch below.
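
A hedged sketch of that fan-out idea using dblink async calls; the shard table names and connection strings below are assumptions, and only one of the shard connections is shown:

create extension if not exists dblink;  
select dblink_connect('c1', 'dbname=postgres');  
select dblink_send_query('c1', $$select * from tbl1 where info ~ 'aad.+f02' limit 10$$);  
-- kick off the queries on the other shard connections the same way, then collect:  
select * from dblink_get_result('c1')  
  as t(id int8, crt_time timestamp, sensorid int, info text);  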

Here is a single 500-million-row table answering in well under a second:

postgres=# explain (verbose,analyze,buffers,timing,costs) select * from tbl1 where info ~ 'aad.+f02' limit 10;
                                                               QUERY PLAN                                                                
-----------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=1439.79..1476.19 rows=10 width=29) (actual time=116.570..116.719 rows=10 loops=1)
   Output: id, crt_time, sensorid, info
   Buffers: shared hit=680
   ->  Bitmap Heap Scan on public.tbl1  (cost=1439.79..191054.88 rows=52103 width=29) (actual time=116.568..116.716 rows=10 loops=1)
         Output: id, crt_time, sensorid, info
         Recheck Cond: (tbl1.info ~ 'aad.+f02'::text)
         Rows Removed by Index Recheck: 38
         Heap Blocks: exact=48
         Buffers: shared hit=680
         ->  Bitmap Index Scan on tbl1_info_idx  (cost=0.00..1426.77 rows=52103 width=0) (actual time=116.495..116.495 rows=403 loops=1)
               Index Cond: (tbl1.info ~ 'aad.+f02'::text)
               Buffers: shared hit=632
 Planning time: 0.311 ms
 Execution time: 116.754 ms
(14 rows)

Time: 117.422 ms
postgres=# select * from tbl1 where info ~ 'aad.+f02' limit 10;
    id     |          crt_time          | sensorid |   info   
-----------+----------------------------+----------+----------
  17986922 | 2016-02-29 17:42:42.427639 |    75863 | aad3f02a
  19873247 | 2016-02-29 17:43:16.714945 |   174971 | 2aad5f02
  23798336 | 2016-02-29 17:44:35.369654 |   202085 | aad06f02
  28630866 | 2016-02-29 17:46:03.544462 |   463184 | baad3f02
  31458529 | 2016-02-29 17:47:00.300937 |   411670 | aad1af02
  52009687 | 2016-02-29 17:53:15.466246 |   192821 | 5aad6f02
  80769909 | 2016-02-29 18:01:31.074248 |    47993 | aadcf029
  80825896 | 2016-02-29 18:01:31.039063 |   284712 | aad14f02
  83385996 | 2016-02-29 18:02:12.699317 |    78233 | daadcf02
 102814316 | 2016-02-29 18:08:20.891412 |   359635 | aad06f02
(10 rows)

Time: 116.901 ms

A full table scan, by contrast, takes:

postgres=# set enable_bitmapscan=off;
SET
Time: 0.145 ms
postgres=# select * from tbl1 where info ~ 'aad.+f02' limit 10;
    id     |          crt_time          | sensorid |   info   
-----------+----------------------------+----------+----------
  31458529 | 2016-02-29 17:47:00.300937 |   411670 | aad1af02
  52009687 | 2016-02-29 17:53:15.466246 |   192821 | 5aad6f02
  80769909 | 2016-02-29 18:01:31.074248 |    47993 | aadcf029
  80825896 | 2016-02-29 18:01:31.039063 |   284712 | aad14f02
  83385996 | 2016-02-29 18:02:12.699317 |    78233 | daadcf02
 102814316 | 2016-02-29 18:08:20.891412 |   359635 | aad06f02
 105236847 | 2016-02-29 18:09:56.914795 |      876 | aadbf026
 108524272 | 2016-02-29 18:10:47.39312  |   338071 | 2aad2f02
 128169786 | 2016-02-29 18:17:52.105948 |   262400 | aad0f028
 135935810 | 2016-02-29 18:20:43.265139 |   487673 | aad7f021
(10 rows)
Time: 98903.073 ms

Performance comparison charts (images omitted): 10 million rows; 500 million rows; 10 million rows, btree bit or|and vs. gin; 14.4 billion rows, partitioned.

How, then, to reach second-level fuzzy-query response at the 10-billion-plus scale?
On a single machine, partition the data and use parallel execution to make full use of the CPUs.
Or use an MPP / sharding architecture to harness the resources of multiple machines.

The guiding principle: reduce rechecks, and scan as little beyond the final result set as possible (heavy scanning and heavy removal of rechecked-false rows happen on both the heap and the index).

Used together, GIN and btree indexes cover every scenario: exact search, prefix search, suffix search, infix fuzzy search, and regex matching.

PostgreSQL 9.5 new feature: CREATE DATABASE ... ALLOW_CONNECTIONS

Anyone who uses PostgreSQL has run into this annoyance: a freshly created database can be connected to by anyone.
For example:
postgres=# revoke all on database template0 from public;
REVOKE
postgres=# revoke all on database template1 from public;
REVOKE

postgres=# create role test1 login;
CREATE ROLE
postgres=# create role test login;
CREATE ROLE

postgres=# \c template0 test;
FATAL:  database "template0" is not currently accepting connections
Previous connection kept
postgres=# \c template1 test;
FATAL:  permission denied for database "template1"
DETAIL:  User does not have CONNECT privilege.
Previous connection kept

postgres=# create database test owner test template template0;
CREATE DATABASE
postgres=# create database test1 owner test1 template template0;
CREATE DATABASE
postgres=# \c test test1
You are now connected to database "test" as user "test1".
test=> \q

Why:
when a database is created, the public role is granted the CONNECT privilege on it by default.
After creating the database we must revoke that privilege; then nobody except superusers and the database's owner can connect.
postgres=# revoke all on database test from public;
REVOKE
The owner and superusers can still connect:
postgres=# \c test test
You are now connected to database "test" as user "test".

Other ordinary users cannot:
test=> \c test test1
FATAL:  permission denied for database "test"
DETAIL:  User does not have CONNECT privilege.
Previous connection kept

Before another ordinary user can connect, the privilege must be granted:
test=> grant connect on database test to test1;
GRANT
test=> \c test test1
You are now connected to database "test" as user "test1".
test=> 

Now the 9.5 feature:
CREATE DATABASE name
    [ [ WITH ] [ OWNER [=] user_name ]
           [ TEMPLATE [=] template ]
           [ ENCODING [=] encoding ]
           [ LC_COLLATE [=] lc_collate ]
           [ LC_CTYPE [=] lc_ctype ]
           [ TABLESPACE [=] tablespace_name ]
           [ ALLOW_CONNECTIONS [=] allowconn ]
           [ CONNECTION LIMIT [=] connlimit ] ]
           [ IS_TEMPLATE [=] istemplate ]
allow_connections=false is not meant to implement the behavior above.
It simply creates a database that nobody may connect to.
I have not yet found a good use case for this feature; it merely corresponds to the pg_database.datallowconn column.
For example:
postgres=# create database test11 owner test allow_connections=false;
CREATE DATABASE
postgres=# \c test11 test
FATAL:  database "test11" is not currently accepting connections
Previous connection kept
postgres=# \c test11 postgres
FATAL:  database "test11" is not currently accepting connections
Previous connection kept
postgres=# grant connect on database test11 to test;
GRANT
postgres=# \c test11 test
FATAL:  database "test11" is not currently accepting connections
Previous connection kept
postgres=# grant all on database test11 to test;
GRANT
postgres=# \c test11 test
FATAL:  database "test11" is not currently accepting connections
Previous connection kept
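
To turn connections back on later, 9.5's ALTER DATABASE accepts the same option, and the flag is visible in pg_database; a minimal sketch:

postgres=# alter database test11 with allow_connections true;
ALTER DATABASE
postgres=# select datname, datallowconn from pg_database where datname = 'test11';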

PostgreSQL: doubling numeric-computation performance with native CPU arithmetic, for large-scale financial calculations


The number types PostgreSQL supports include integers, floating point, and PG's own numeric type.

src/backend/utils/adt/numeric.c  
src/backend/utils/adt/float.c  

numeric can store extremely large values, more than 2^17 digits long. The higher precision comes with a performance penalty: it cannot take full advantage of the CPU's native arithmetic.

typedef struct NumericVar  
{  
        int                     ndigits;                /* # of digits in digits[] - can be 0! */  
        int                     weight;                 /* weight of first digit */  
        int                     sign;                   /* NUMERIC_POS, NUMERIC_NEG, or NUMERIC_NAN */  
        int                     dscale;                 /* display scale */  
        NumericDigit *buf;                      /* start of palloc'd space for digits[] */  
        NumericDigit *digits;           /* base-NBASE digits */  
} NumericVar;  

The float types are far lighter-weight than numeric, so they perform much better, about twice as fast.
In big-data scenarios, saving half the computation is substantial, especially in finance with its heavy numeric workloads.
If you have played with greenplum, deepgreen, or vitessedb, you will have noticed that their benchmark guides suggest replacing numeric with money or float8 for better performance.
But money and float8 carry an inherent risk: once the precision is exceeded, results may be wrong.
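
A quick illustration of that risk; the literal below has 20 significant digits, beyond float8's roughly 15:

postgres=# select 12345678901234567890::numeric + 1;  -- exact: 12345678901234567891  
postgres=# select 12345678901234567890::float8 + 1;   -- the +1 is silently lost in rounding  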
So how do we speed up numeric without risking wrong results?
We can use the fixeddecimal extension:
https://github.com/2ndQuadrant/fixeddecimal

fixeddecimal's principle is simple: the value is actually stored as an int8, with the number of integer and fractional digits fixed in the code:

/*  
 * The scale which the number is actually stored.  
 * For example: 100 will allow 2 decimal places of precision  
 * This must always be a '1' followed by a number of '0's.  
 */  
#define FIXEDDECIMAL_MULTIPLIER 100LL  

/*  
 * Number of decimal places to store.  
 * This number should be the number of decimal digits that it takes to  
 * represent FIXEDDECIMAL_MULTIPLIER - 1  
 */  
#define FIXEDDECIMAL_SCALE 2  

If FIXEDDECIMAL_SCALE is set to 2, FIXEDDECIMAL_MULTIPLIER is 100; if FIXEDDECIMAL_SCALE is set to 3, FIXEDDECIMAL_MULTIPLIER is 1000.
In other words, the value is stored as an integer; for display, dividing by the multiplier yields the integer part and the remainder yields the fractional part.
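
So with the default scale of 2, every value is just value × 100 held in an int8, and arithmetic is plain integer arithmetic. A minimal sketch, assuming the extension is installed (the + operator below is assumed to be one the extension provides, like the / shown later):

postgres=# select '123.45'::fixeddecimal;                         -- held internally as the int8 12345  
postgres=# select '123.45'::fixeddecimal + '0.05'::fixeddecimal;  -- 12345 + 5 = 12350, displayed as 123.50  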

/*  
 * fixeddecimal2str  
 *              Prints the fixeddecimal 'val' to buffer as a string.  
 *              Returns a pointer to the end of the written string.  
 */  
static char *  
fixeddecimal2str(int64 val, char *buffer)  
{  
        char       *ptr = buffer;  
        int64           integralpart = val / FIXEDDECIMAL_MULTIPLIER;  
        int64           fractionalpart = val % FIXEDDECIMAL_MULTIPLIER;  

        if (val < 0)  
        {  
                fractionalpart = -fractionalpart;  

                /*  
                 * Handle special case for negative numbers where the intergral part  
                 * is zero. pg_int64tostr() won't prefix with "-0" in this case, so  
                 * we'll do it manually  
                 */  
                if (integralpart == 0)  
                        *ptr++ = '-';  
        }  
        ptr = pg_int64tostr(ptr, integralpart);  
        *ptr++ = '.';  
        ptr = pg_int64tostr_zeropad(ptr, fractionalpart, FIXEDDECIMAL_SCALE);  
        return ptr;  
}  

So the range of values fixeddecimal can store is the int8 range divided by the multiplier.

postgres=# select 9223372036854775807::int8;  
        int8           
---------------------  
 9223372036854775807  
(1 row)  

postgres=# select 9223372036854775808::int8;  
ERROR:  22003: bigint out of range  
LOCATION:  numeric_int8, numeric.c:2955  

postgres=# select 92233720368547758.07::fixeddecimal;  
     fixeddecimal       
----------------------  
 92233720368547758.07  
(1 row)  

postgres=# select 92233720368547758.08::fixeddecimal;  
ERROR:  22003: value "92233720368547758.08" is out of range for type fixeddecimal  
LOCATION:  scanfixeddecimal, fixeddecimal.c:499  

Also note that building fixeddecimal requires a compiler that supports __int128; gcc 4.9.3 does. If your gcc is older, upgrade it first.
http://blog.163.com/digoal@126/blog/static/163877040201601313814429/

Now test fixeddecimal on PostgreSQL 9.5: run addition, subtraction, multiplication, division, and aggregation over 100 million rows, and compare the results and speed of float8, numeric, and fixeddecimal.
auto_explain records the plans and timings for the three runs.

psql
\timing  

postgres=# load 'auto_explain';  
LOAD  
Time: 2.328 ms  

postgres=# set auto_explain.log_analyze =true;  
SET  
Time: 0.115 ms  
postgres=# set auto_explain.log_buffers =true;  
SET  
Time: 0.080 ms  
postgres=# set auto_explain.log_nested_statements=true;  
SET  
Time: 0.073 ms  
postgres=# set auto_explain.log_timing=true;  
SET  
Time: 0.089 ms  
postgres=# set auto_explain.log_triggers=true;  
SET  
Time: 0.076 ms  
postgres=# set auto_explain.log_verbose=true;  
SET  
Time: 0.074 ms  
postgres=# set auto_explain.log_min_duration=0;  
SET  
Time: 0.149 ms  
postgres=# set client_min_messages ='log';  
SET  
Time: 0.144 ms  

postgres=# set work_mem='8GB';  
SET  
Time: 0.152 ms  

postgres=# select sum(i::numeric),min(i::numeric),max(i::numeric),avg(i::numeric),sum(3.0::numeric*(i::numeric+i::numeric)),avg(i::numeric/3.0::numeric) from generate_series(1,100000000) t(i);  
LOG:  duration: 241348.655 ms  plan:  
Query Text: select sum(i::numeric),min(i::numeric),max(i::numeric),avg(i::numeric),sum(3.0::numeric*(i::numeric+i::numeric)),avg(i::numeric/3.0::numeric) from generate_series(1,100000000) t(i);  
Aggregate  (cost=50.01..50.02 rows=1 width=4) (actual time=241348.631..241348.631 rows=1 loops=1)  
  Output: sum((i)::numeric), min((i)::numeric), max((i)::numeric), avg((i)::numeric), sum((3.0 * ((i)::numeric + (i)::numeric))), avg(((i)::numeric / 3.0))  
  ->  Function Scan on pg_catalog.generate_series t  (cost=0.00..10.00 rows=1000 width=4) (actual time=12200.116..22265.586 rows=100000000 loops=1)  
        Output: i  
        Function Call: generate_series(1, 100000000)  
       sum        | min |    max    |          avg          |         sum         |              avg                
------------------+-----+-----------+-----------------------+---------------------+-------------------------------  
 5000000050000000 |   1 | 100000000 | 50000000.500000000000 | 30000000300000000.0 | 16666666.83333333333333333333  
(1 row)  

Time: 243149.286 ms  

postgres=# select sum(i::float8),min(i::float8),max(i::float8),avg(i::float8),sum(3.0::float8*(i::float8+i::float8)),avg(i::float8/3.0::float8) from generate_series(1,100000000) t(i);  
LOG:  duration: 112407.004 ms  plan:  
Query Text: select sum(i::float8),min(i::float8),max(i::float8),avg(i::float8),sum(3.0::float8*(i::float8+i::float8)),avg(i::float8/3.0::float8) from generate_series(1,100000000) t(i);  
Aggregate  (cost=50.01..50.02 rows=1 width=4) (actual time=112406.967..112406.967 rows=1 loops=1)  
  Output: sum((i)::double precision), min((i)::double precision), max((i)::double precision), avg((i)::double precision), sum(('3'::double precision * ((i)::double precision + (i)::double precision))), avg(((i)::double precision / '3'::double precision))  
  ->  Function Scan on pg_catalog.generate_series t  (cost=0.00..10.00 rows=1000 width=4) (actual time=12157.571..20994.444 rows=100000000 loops=1)  
        Output: i  
        Function Call: generate_series(1, 100000000)  
      sum       | min |    max    |    avg     |         sum          |       avg          
----------------+-----+-----------+------------+----------------------+------------------  
 5.00000005e+15 |   1 | 100000000 | 50000000.5 | 3.00000003225094e+16 | 16666666.8333333  
(1 row)  

Time: 114208.528 ms  

postgres=# select sum(i::fixeddecimal),min(i::fixeddecimal),max(i::fixeddecimal),avg(i::fixeddecimal),sum(3.0::fixeddecimal*(i::fixeddecimal+i::fixeddecimal)),avg(i::fixeddecimal/3.0::fixeddecimal) from generate_series(1,100000000) t(i);  
LOG:  duration: 97956.458 ms  plan:  
Query Text: select sum(i::fixeddecimal),min(i::fixeddecimal),max(i::fixeddecimal),avg(i::fixeddecimal),sum(3.0::fixeddecimal*(i::fixeddecimal+i::fixeddecimal)),avg(i::fixeddecimal/3.0::fixeddecimal) from generate_series(1,100000000) t(i);  
Aggregate  (cost=50.01..50.02 rows=1 width=4) (actual time=97956.431..97956.431 rows=1 loops=1)  
  Output: sum((i)::fixeddecimal), min((i)::fixeddecimal), max((i)::fixeddecimal), avg((i)::fixeddecimal), sum(('3.00'::fixeddecimal * ((i)::fixeddecimal + (i)::fixeddecimal))), avg(((i)::fixeddecimal / '3.00'::fixeddecimal))  
  ->  Function Scan on pg_catalog.generate_series t  (cost=0.00..10.00 rows=1000 width=4) (actual time=12168.630..20874.617 rows=100000000 loops=1)  
        Output: i  
        Function Call: generate_series(1, 100000000)  
         sum         | min  |     max      |     avg     |         sum          |     avg       
---------------------+------+--------------+-------------+----------------------+-------------  
 5000000050000000.00 | 1.00 | 100000000.00 | 50000000.50 | 30000000300000000.00 | 16666666.83  
(1 row)  

Time: 99763.032 ms  

Performance comparison (chart omitted): numeric about 241 s, float8 about 112 s, fixeddecimal about 98 s.

Note, in the tests above:
float8's results are already inexact. fixeddecimal used the default scale=2, so two decimal places of precision are kept.
numeric has the highest precision; part of the value is simply not displayed, which PG controls internally.
Also note that fixeddecimal truncates rather than rounds anything beyond its scale: 123.555 is stored as 12355, not 12356.

postgres=# select '123.555'::fixeddecimal;  
 fixeddecimal   
--------------  
 123.55  
(1 row)  

postgres=# select '123.555'::fixeddecimal/'123.556'::fixeddecimal;  
 ?column?   
----------  
 1.00  
(1 row)  

postgres=# select '124.555'::fixeddecimal/'123.556'::fixeddecimal;  
 ?column?   
----------  
 1.00  
(1 row)  

postgres=# select 124.555/123.556;  
      ?column?        
--------------------  
 1.0080854025704944  
(1 row)  