ClickHouse common operations
CREATE DATABASE IF NOT EXISTS tutorial;
show databases;
CREATE TABLE tutorial.hits_v1
(
`WatchID` UInt64,
`JavaEnable` UInt8,
`Title` String,
`GoodEvent` Int16,
`EventTime` DateTime,
`EventDate` Date,
`CounterID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RegionID` UInt32,
`UserID` UInt64,
`CounterClass` Int8,
`OS` UInt8,
`UserAgent` UInt8,
`URL` String,
`Referer` String,
`URLDomain` String,
`RefererDomain` String,
`Refresh` UInt8,
`IsRobot` UInt8,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`FlashMinor2` String,
`NetMajor` UInt8,
`NetMinor` UInt8,
`UserAgentMajor` UInt16,
`UserAgentMinor` FixedString(2),
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`MobilePhone` UInt8,
`MobilePhoneModel` String,
`Params` String,
`IPNetworkID` UInt32,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`IsArtifical` UInt8,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`ClientTimeZone` Int16,
`ClientEventTime` DateTime,
`SilverlightVersion1` UInt8,
`SilverlightVersion2` UInt8,
`SilverlightVersion3` UInt32,
`SilverlightVersion4` UInt16,
`PageCharset` String,
`CodeVersion` UInt32,
`IsLink` UInt8,
`IsDownload` UInt8,
`IsNotBounce` UInt8,
`FUniqID` UInt64,
`HID` UInt32,
`IsOldCounter` UInt8,
`IsEvent` UInt8,
`IsParameter` UInt8,
`DontCountHits` UInt8,
`WithHash` UInt8,
`HitColor` FixedString(1),
`UTCEventTime` DateTime,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`WindowName` Int32,
`OpenerName` Int32,
`HistoryLength` Int16,
`BrowserLanguage` FixedString(2),
`BrowserCountry` FixedString(2),
`SocialNetwork` String,
`SocialAction` String,
`HTTPError` UInt16,
`SendTiming` Int32,
`DNSTiming` Int32,
`ConnectTiming` Int32,
`ResponseStartTiming` Int32,
`ResponseEndTiming` Int32,
`FetchTiming` Int32,
`RedirectTiming` Int32,
`DOMInteractiveTiming` Int32,
`DOMContentLoadedTiming` Int32,
`DOMCompleteTiming` Int32,
`LoadEventStartTiming` Int32,
`LoadEventEndTiming` Int32,
`NSToDOMContentLoadedTiming` Int32,
`FirstPaintTiming` Int32,
`RedirectCount` Int8,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`ParamPrice` Int64,
`ParamOrderID` String,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`GoalsReached` Array(UInt32),
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`RefererHash` UInt64,
`URLHash` UInt64,
`CLID` UInt32,
`YCLID` UInt64,
`ShareService` String,
`ShareURL` String,
`ShareTitle` String,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`IslandID` FixedString(16),
`RequestNum` UInt32,
`RequestTry` UInt8
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID);
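The SAMPLE BY clause above enables approximate queries. Once the data is loaded, a query like the following reads only about a tenth of all users (a sketch; the 0.1 sampling ratio is arbitrary):
SELECT CounterID, count() AS c
FROM tutorial.hits_v1
SAMPLE 0.1
GROUP BY CounterID
ORDER BY c DESC
LIMIT 10;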
CREATE TABLE tutorial.visits_v1
(
`CounterID` UInt32,
`StartDate` Date,
`Sign` Int8,
`IsNew` UInt8,
`VisitID` UInt64,
`UserID` UInt64,
`StartTime` DateTime,
`Duration` UInt32,
`UTCStartTime` DateTime,
`PageViews` Int32,
`Hits` Int32,
`IsBounce` UInt8,
`Referer` String,
`StartURL` String,
`RefererDomain` String,
`StartURLDomain` String,
`EndURL` String,
`LinkURL` String,
`IsDownload` UInt8,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`PlaceID` Int32,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`IsYandex` UInt8,
`GoalReachesDepth` Int32,
`GoalReachesURL` Int32,
`GoalReachesAny` Int32,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`MobilePhoneModel` String,
`ClientEventTime` DateTime,
`RegionID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`IPNetworkID` UInt32,
`SilverlightVersion3` UInt32,
`CodeVersion` UInt32,
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`UserAgentMajor` UInt16,
`UserAgentMinor` UInt16,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`SilverlightVersion2` UInt8,
`SilverlightVersion4` UInt16,
`FlashVersion3` UInt16,
`FlashVersion4` UInt16,
`ClientTimeZone` Int16,
`OS` UInt8,
`UserAgent` UInt8,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`NetMajor` UInt8,
`NetMinor` UInt8,
`MobilePhone` UInt8,
`SilverlightVersion1` UInt8,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`JavaEnable` UInt8,
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`BrowserLanguage` UInt16,
`BrowserCountry` UInt16,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`Params` Array(String),
`Goals` Nested(
ID UInt32,
Serial UInt32,
EventTime DateTime,
Price Int64,
OrderID String,
CurrencyID UInt32),
`WatchIDs` Array(UInt64),
`ParamSumPrice` Int64,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`ClickLogID` UInt64,
`ClickEventID` Int32,
`ClickGoodEvent` Int32,
`ClickEventTime` DateTime,
`ClickPriorityID` Int32,
`ClickPhraseID` Int32,
`ClickPageID` Int32,
`ClickPlaceID` Int32,
`ClickTypeID` Int32,
`ClickResourceID` Int32,
`ClickCost` UInt32,
`ClickClientIP` UInt32,
`ClickDomainID` UInt32,
`ClickURL` String,
`ClickAttempt` UInt8,
`ClickOrderID` UInt32,
`ClickBannerID` UInt32,
`ClickMarketCategoryID` UInt32,
`ClickMarketPP` UInt32,
`ClickMarketCategoryName` String,
`ClickMarketPPName` String,
`ClickAWAPSCampaignName` String,
`ClickPageName` String,
`ClickTargetType` UInt16,
`ClickTargetPhraseID` UInt64,
`ClickContextType` UInt8,
`ClickSelectType` Int8,
`ClickOptions` String,
`ClickGroupBannerID` Int32,
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`FirstVisit` DateTime,
`PredLastVisit` Date,
`LastVisit` Date,
`TotalVisits` UInt32,
`TraficSource` Nested(
ID Int8,
SearchEngineID UInt16,
AdvEngineID UInt8,
PlaceID UInt16,
SocialSourceNetworkID UInt8,
Domain String,
SearchPhrase String,
SocialSourcePage String),
`Attendance` FixedString(16),
`CLID` UInt32,
`YCLID` UInt64,
`NormalizedRefererHash` UInt64,
`SearchPhraseHash` UInt64,
`RefererDomainHash` UInt64,
`NormalizedStartURLHash` UInt64,
`StartURLDomainHash` UInt64,
`NormalizedEndURLHash` UInt64,
`TopLevelDomain` UInt64,
`URLScheme` UInt64,
`OpenstatServiceNameHash` UInt64,
`OpenstatCampaignIDHash` UInt64,
`OpenstatAdIDHash` UInt64,
`OpenstatSourceIDHash` UInt64,
`UTMSourceHash` UInt64,
`UTMMediumHash` UInt64,
`UTMCampaignHash` UInt64,
`UTMContentHash` UInt64,
`UTMTermHash` UInt64,
`FromHash` UInt64,
`WebVisorEnabled` UInt8,
`WebVisorActivity` UInt32,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`Market` Nested(
Type UInt8,
GoalID UInt32,
OrderID String,
OrderPrice Int64,
PP UInt32,
DirectPlaceID UInt32,
DirectOrderID UInt32,
DirectBannerID UInt32,
GoodID String,
GoodName String,
GoodQuantity Int32,
GoodPrice Int64),
`IslandID` FixedString(16)
)
ENGINE = CollapsingMergeTree(Sign)
PARTITION BY toYYYYMM(StartDate)
ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)
SAMPLE BY intHash32(UserID);
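Note the Sign column: CollapsingMergeTree collapses pairs of rows that differ only in Sign (1 for a state row, -1 for its cancellation) during background merges, so while merges are still pending, aggregates should be weighted by Sign. A minimal sketch of the usual pattern:
SELECT
    CounterID,
    sum(PageViews * Sign) AS page_views,
    sum(Sign) AS visits
FROM tutorial.visits_v1
GROUP BY CounterID
HAVING sum(Sign) > 0
LIMIT 10;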
curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv
curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv
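Load the downloaded TSV files and verify the row count (the two INSERT commands are taken from the official ClickHouse tutorial):
clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv
clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv
clickhouse-client --query "SELECT count() FROM tutorial.hits_v1"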
--- Client connection tool
clickhouse-client -m -u root -p root
Show the SQL queries currently executing
SHOW PROCESSLIST
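A long-running query found in the process list can be cancelled by its query_id (the id below is a placeholder):
KILL QUERY WHERE query_id = 'xxx-query-id' SYNC;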
Create table statement
create TABLE test.test( id Int32, create_date Date, c2 Nullable(String) )
engine=MergeTree(create_date, (id, create_date), 8192);
Table alteration statements
The ALTER query is only supported for MergeTree-family tables (as well as the Merge and Distributed engines). The query has several variants.
ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|MODIFY COLUMN
--change a column's type
alter TABLE test.ontime_wpp_t MODIFY COLUMN TailNum Nullable(String);
alter TABLE test.ontime_wpp_t ADD COLUMN TailNum2 Nullable(String) after Div5TailNum;
alter TABLE test.ontime_wpp_t drop COLUMN TailNum2;
Monitoring table mutations via system tables
select * from system.mutations where is_done = 0 order by create_time desc limit 1;
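A stuck mutation can be cancelled with KILL MUTATION; the mutation_id below is a placeholder taken from system.mutations:
KILL MUTATION WHERE database = 'test' AND table = 'ontime_wpp_t' AND mutation_id = 'mutation_3.txt';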
Drop a table
drop table cdm_dwd.dwd_ord_car_sharing_df on cluster crm_4shards_1replicas;
Delete all data (truncate)
truncate table cdm_dwd.dwd_ord_car_sharing_df on cluster crm_4shards_1replicas;
Rename a table
RENAME TABLE test.ontime_wpp_t to test.ontime_wpp_t2;
Cluster-wide operation
RENAME TABLE cdm_dwd.dwd_ord_carsh_base_df2 to cdm_dwd.dwd_ord_carsh_base_df on cluster crm_4shards_1replicas;
1. Data export
Run on the relevant node:
echo 'select * from test.ads_user_portrait_vertical_df_cls' | curl localhost:8123?database=test -uroot:root -d @- > table_name.sql
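An alternative export that bypasses the HTTP interface and writes straight from clickhouse-client (a sketch; pick whatever output format you need):
clickhouse-client --query="select * from test.ads_user_portrait_vertical_df_cls FORMAT TabSeparated" > table_name.tsv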
2. Data import, with tab as the delimiter
Run locally on the host of the target database:
cat table_name.sql | clickhouse-client --query="INSERT INTO database.table_name FORMAT TabSeparated"
cat /root/user_lable_local_mid_cluster.tgz | clickhouse-client --user hadoop --password hadoop --query="INSERT INTO lmmbase.user_lable_local_mid_cluster FORMAT CSV";
cat /home/hadoop/work_wpp/user_label_uid_cluster | clickhouse-client --user hadoop --password hadoop --query="INSERT INTO lmmbase.user_label_uid_cluster FORMAT CSV";
cat /tmp/test_user2| clickhouse-client --user hadoop --password hadoop --query="INSERT INTO lmmbase.test_user2 FORMAT CSV";
Insert statements
Non-strict insert: columns that are not listed are automatically filled with their default values
INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22)
Strict insert: a value must be supplied for every listed column
INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23)
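For example, against the test.test table created earlier (the values are made up):
INSERT INTO test.test (id, create_date) VALUES (1, '2019-10-17'), (2, '2019-10-18');  -- c2 falls back to its default, NULL
INSERT INTO test.test (id, create_date, c2) VALUES (3, '2019-10-19', 'tag');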
cat /tmp/user_point_info | clickhouse-client --query="INSERT INTO test.user_point_info FORMAT CSV";
clickhouse-client -m --user hadoop --password hadoop --query="truncate table lmmbase.user_label_uid on cluster crm_4shards_1replicas";
ssh hadoop@dn1 "/bin/bash /home/hadoop/app/otherApp/truncate_user_label_uid_data.sh"
clickhouse-client --query=" alter table test.ads_user_portrait_vertical_df delete where create_time ='2019-10-17' ";
Load testing: run these queries concurrently from cron and watch the machine load
*/2 * * * * clickhouse-client -m --query="select t_mac,t_type,count(*) cnt from carendpoint_porlog_cls group by t_mac,t_type order by cnt desc limit 100;"
*/2 * * * * clickhouse-client -m --query="select t_mac,count(*) cnt from carendpoint_porlog_cls group by t_mac order by cnt desc limit 100;"
*/2 * * * * clickhouse-client -m --query="select t_type,count(*) cnt from carendpoint_porlog_cls group by t_type order by cnt desc limit 100;"
*/1 * * * * clickhouse-client -m --query="select t_ip,t_type,count(*) cnt from carendpoint_porlog_cls group by t_ip,t_type order by cnt desc limit 100;" >> /root/wpp1.log
*/1 * * * * clickhouse-client -m --query="select t_ip,count(*) cnt from carendpoint_porlog_cls group by t_ip order by cnt desc limit 100;" >> /root/wpp2.log
*/1 * * * * clickhouse-client -m --query="select event,count(*) cnt from carendpoint_porlog_cls group by event order by cnt desc limit 100;" >> /root/wpp2.log
————————————————
Copyright notice: this part is an original article by CSDN blogger "wppwpp1", licensed under CC 4.0 BY-SA; include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/wppwpp1/article/details/106021619/
Recently I have been doing some ClickHouse work, so I am recording the operations here.
1. Our database uses the MergeTree engine, so UPDATE and DELETE are supported (as ALTER mutations).
2. The operations:
Below is a data UPDATE script; to update multiple columns, just list them after UPDATE, separated by commas.
alter table summer_cn.bi_ime_sale_history update sale_date = toDateTime('2020-03-31 00:00:00') where id in (
'IDVIVO_sale1245122',
'IDVIVO_sale1245174',
'IDVIVO_sale1245233',
'IDVIVO_sale1245173');
Next is the DELETE SQL; the WHERE clause holds the filter condition.
alter table summer_cn.bi_ime_sale_history delete where id in (
'IDVIVO_sale1245122',
'IDVIVO_sale1245174',
'IDVIVO_sale1245233',
'IDVIVO_sale1245173');
Next is the SELECT SQL; SELECT works the same way as in an ordinary database.
select * from summer_cn.bi_ime_sale_history where id in (
'IDVIVO_sale1245122',
'IDVIVO_sale1245174',
'IDVIVO_sale1245233',
'IDVIVO_sale1245173');
#!/bin/bash
# Two arguments: a start date and an end date; loops over every day in between
# Input format:  20191001 20191031
# Output format: 2019-10-01 2019-10-31
startdate=`date -d "+0 day $1" +%Y%m%d`
enddate=`date -d "+0 day $2" +%Y%m%d`
while [[ $startdate -le $enddate ]]
do
date=$startdate
startdateother=`date -d "+0 day $date" +%Y-%m-%d`
dateother=$startdateother
echo "$dateother" >> /root/success.txt
startdate=`date -d "+1 day $date" +%Y%m%d`
startdateother=`date -d "+1 day $date" +%Y-%m-%d`
sql1="insert into database.table select '$dateother' as stat_date,vehicleno,vehiclecolor,count(vehicleno) as cn from database.table where up_date between '$dateother 00:00:00' and '$dateother 23:59:59' group by vehicleno,vehiclecolor"
sql2="insert into database.table select '$dateother' stat_date,vehicleno ,(max( toUInt32OrZero(vec3)) - min( toUInt32OrZero(vec3))) mileage,vehiclecolor,now() create_date from database.table where up_date between '$dateother 00:00:00' and '$dateother 23:59:59' group by vehicleno,vehiclecolor"
clickhouse-client --host XXX --port 9000 --user <your_user> --password <your_password> --multiquery -q "${sql1};${sql2}"
echo "$dateother done" >> /root/success.txt
done
./xxx.sh 20191001 20191031
vim /root/success.txt
https://blog.51cto.com/wujianwei/3034589
CREATE TABLE test_table(
province String,
province_name String,
create_date date
) ENGINE = MergeTree(create_date, (province), 8192);
Notes on the create-table SQL format:
ENGINE: the table engine type. MergeTree is the most common; Log is also fairly common. (In this legacy syntax) MergeTree requires a date column and a primary key; Log has no such requirement.
create_date: the table's date column; such a table must have one date column.
province: the table's primary key; the key may contain multiple columns, separated by commas.
8192: the index granularity; just use the default of 8192.
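The engine signature above is the legacy syntax; the same table in the current syntax would look like this (a sketch):
CREATE TABLE test_table (
    province String,
    province_name String,
    create_date Date
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(create_date)
ORDER BY province
SETTINGS index_granularity = 8192;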
cat > test_table.csv << EOF
WA,WA_NAME,2020-08-25
CA,CA_NAME,2020-09-25
OR,OR_NAME,2020-10-25
EOF
/opt/data/test_table.csv
clickhouse-client --password 'N03P6VHv)bnz1@%xOg' --query "INSERT INTO tutorial.test_table FORMAT CSV" < test_table.csv;
cat test_table.csv | clickhouse-client --query "INSERT INTO tutorial.test_table FORMAT CSV"
clickhouse-client -udefault --password='N03P6VHv)bnz1@%xOg' --query "select * from tutorial.test_table limit 2";
WA WA_NAME 2020-08-25
CA CA_NAME 2020-09-25
[root@tidb06 ~]#
[root@tidb06 ~]# clickhouse-client -udefault --password='N03P6VHv)bnz1@%xOg' --query "select * from tutorial.test_table01 limit 2";
Hive external partitioned table
-- dwd.p2_min_yield_val definition
CREATE external TABLE `dwd.p2_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string);
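Since this is an external table, partition directories created directly on HDFS are invisible until they are registered in the metastore; repair the table to pick them up:
MSCK REPAIR TABLE dwd.p2_min_yield_val;
SHOW PARTITIONS dwd.p2_min_yield_val;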
--0x10
-- dwd.p2_min_yield_val definition
CREATE EXTERNAL TABLE `dwd.p2_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://edh/user/hive/warehouse/dwd.db/p2_min_yield_val'
TBLPROPERTIES (
'transient_lastDdlTime'='1639188365');
--0x11
Hive partitioned table
-- dwd.p_min_yield_val definition
CREATE TABLE `dwd.p_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
'path'='hdfs://edh/user/hive/warehouse/dwd.db/p_min_yield_val')
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://edh/user/hive/warehouse/dwd.db/p_min_yield_val'
TBLPROPERTIES (
'spark.sql.partitionProvider'='catalog',
'spark.sql.sources.provider'='parquet',
'spark.sql.sources.schema.numPartCols'='4',
'spark.sql.sources.schema.numParts'='1',
'spark.sql.sources.schema.part.0'='{\"type\":\"struct\",\"fields\":[{\"name\":\"key\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"host_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"model_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dev_group\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"path_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"staff_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"staff_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"class_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"variety\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"yarn_count\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"class_order\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"class_order_alias\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"efficiency\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"factor\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_m\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_kg\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_add_m\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_add_kg\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"htime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"crt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"online\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"current_shift\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"year\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"month\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"day\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"producer_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}',
'spark.sql.sources.schema.partCol.0'='year',
'spark.sql.sources.schema.partCol.1'='month',
'spark.sql.sources.schema.partCol.2'='day',
'spark.sql.sources.schema.partCol.3'='producer_id',
'transient_lastDdlTime'='1638949169');
Hive partitioned + bucketed table
-- dwd.p3_min_yield_val definition
CREATE external TABLE `dwd.p3_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string) clustered by (host_id) into 3 buckets;
CREATE external TABLE `dwd.p4_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string) clustered by (class_order) into 3 buckets;
--- Load data into the partitioned + bucketed table
---- start: partition + bucket, bucketed by class_order
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val where crt='2021-12-10 09:00:03'
cluster by (class_order);
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val where crt='2021-12-11 03:02:02'
cluster by (class_order);
---- end
--01
-- dwd.p3_min_yield_val definition
CREATE EXTERNAL TABLE `dwd.p3_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string)
CLUSTERED BY (
host_id)
INTO 3 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://edh/user/hive/warehouse/dwd.db/p3_min_yield_val'
TBLPROPERTIES (
'transient_lastDdlTime'='1639188765');
-- 02
Set the enforce-bucketing property:
set hive.enforce.bucketing=true;   -- or false
If the number of reducers does not match the number of buckets, set the reducer count manually.
Set the number of reduce tasks:
set mapreduce.job.reduces=4;
Bucketed query tests:
select * from buc1
cluster by (uid)
1. Query everything:
select * from buc3;
select * from buc3 tablesample(bucket 1 out of 1)
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 1 on class_order)
WHERE producer_id = '2162304858206502921';
Query a specific bucket:
select * from buc3 tablesample(bucket 1 out of 4 on uid); -- rows where hash(uid) % 4 == 0
select * from buc3 tablesample(bucket 1 out of 2 on uid);
-- query one bucket out of a 3-way split
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 3 on class_order) -- rows where hash(class_order) % 3 == 0
WHERE producer_id = '2162304858206502921';
-- query one bucket out of a 2-way split (hash % 2 == 0)
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
-- query rows where class_order = 2
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 3 on class_order)
WHERE producer_id = '2162304858206502921';
select *
from dwd.p4_min_yield_val tablesample(bucket 0 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
-- query rows with even class_order (= 2)
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
-- query rows with odd class_order (= 1)
select *
from dwd.p4_min_yield_val tablesample(bucket 2 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
1. Make Hive enforce bucketing, automatically bucketing data according to the table's bucket definition (recommended):
set hive.enforce.bucketing = true;
2. Manually set the number of reducers:
set mapreduce.job.reduces = num;
or
set mapreduce.reduce.tasks = num;
and add a CLUSTER BY clause after the SELECT.
Explanation:
tablesample(bucket x out of y on uid)
x: the bucket number to start reading from
y: the sampling denominator; y should be a factor or a multiple of the table's total bucket count, and x must not be greater than y
Neither compressed nor stretched: 1 out of 4 on a 4-bucket table returns exactly bucket 1.
Compressed: 1 out of 2 on a 4-bucket table returns 4/2 = 2 buckets, namely bucket 1 and bucket 1 + 4/2 = 3.
Stretched: 1 out of 8 on a 4-bucket table returns 4/8 = 1/2 of bucket 1.
This effectively re-buckets all of the data.
Example:
Query rows with odd uid (TABLESAMPLE must come immediately after the table name):
select * from buc3 tablesample(bucket 2 out of 2 on uid)
Queries:
select * from part_tmp limit 3;
select * from part_tmp tablesample(3 rows);
select * from part_tmp tablesample(13 percent);
select * from part_tmp tablesample(56B); ### size units: B K M G T P
Randomly sample 3 rows:
select * from part_tmp order by rand() limit 3;
select rand() as rand1, t.* from part_tmp t;
Bucketing summary:
1. Definition:
clustered by (uid) -- specifies the bucketing column
sorted by (uid desc) -- specifies the sort rule, i.e. the data is expected to be stored sorted by this column in this order
2. Loading data:
cluster by (uid) -- specifies the column getPartition hashes on; the sort column is the same one, ascending by default
distribute by (uid) -- specifies the column getPartition hashes on
sort by (uid asc) -- specifies the sort column and sort order
-- the more flexible form: this way of loading lets you choose the getPartition column and the sort column independently
cluster by (uid) produces the same result as distribute by (uid) sort by (uid asc), as the sketch below shows
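A sketch of that equivalence, loading a hypothetical bucketed table buc3 (assumed to match stu_temp's columns) from the stu_temp staging table defined below:
insert overwrite table buc3
select uid, uname, usex from stu_temp
cluster by (uid);
-- same result:
insert overwrite table buc3
select uid, uname, usex from stu_temp
distribute by (uid) sort by (uid asc);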
Bucketing within partitions:
Example: partition by sex (1 = male, 2 = female), and within each partition bucket by whether uid is odd or even:
1 gyy1 1
2 gyy2 2
3 gyy3 2
4 gyy4 1
5 gyy5 2
6 gyy6 1
7 gyy7 1
8 gyy8 2
9 gyy9 1
10 gyy10 1
11 gyy11 2
12 gyy12 1
1. Create a staging table:
create table if not exists stu_temp(
uid int,
uname string,
usex int
)
row format delimited
fields terminated by ' '
;
Load the data:
load data local inpath '/usr/local/hivedata/stu.dat' into table stu_temp;
Create the partitioned + bucketed table:
create table if not exists stus(
uid int,
uname string
)
partitioned by (sex int)
clustered by (uid) into 2 buckets
row format delimited
fields terminated by ' '
;
-- the insert into approach:
insert into table stus partition(sex)
select uid,uname,usex from stu_temp
cluster by (uid)
;
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val limit 30;
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val cluster by (class_order) limit 30;
set hive.enforce.bucketing=true;
Requirement:
Query female students whose student number (uid) is odd:
select *
from stus tablesample(bucket 2 out of 2 on uid)
where sex=2;
-- SELECT * FROM dwd.p_min_yield_val WHERE day = '2021-12-09' AND producer_id = '2162304858206502921' ORDER BY crt DESC;
-- query the day-shift data where class_order = 1
select *
from dwd.p4_min_yield_val tablesample(bucket 2 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
/user/hive/warehouse/dwd.db/p4_min_yield_val/year=2021/month=2021-12/day=2021-12-10
Notes:
1. Partitioning uses columns outside the table schema, while bucketing uses columns from the table itself.
2. Bucketing divides and manages the data at a finer granularity, and is mostly used for data sampling and JOIN optimization (see the sketch below).
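For the JOIN case in note 2: when two tables are bucketed on the join key with compatible bucket counts, Hive can use a bucket map join. A sketch, assuming a hypothetical stus_score table clustered by (uid) into 2 buckets like stus:
set hive.optimize.bucketmapjoin=true;
select /*+ MAPJOIN(s2) */ s1.uid, s1.uname, s2.score
from stus s1
join stus_score s2 on s1.uid = s2.uid;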
————————————————
Copyright notice: this part is an original article by CSDN blogger "brz_em", licensed under CC 4.0 BY-SA; include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/qq_35180983/article/details/82902943