ClickHouse common operations
CREATE DATABASE IF NOT EXISTS tutorial;
show databases;
CREATE TABLE tutorial.hits_v1
(
`WatchID` UInt64,
`JavaEnable` UInt8,
`Title` String,
`GoodEvent` Int16,
`EventTime` DateTime,
`EventDate` Date,
`CounterID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RegionID` UInt32,
`UserID` UInt64,
`CounterClass` Int8,
`OS` UInt8,
`UserAgent` UInt8,
`URL` String,
`Referer` String,
`URLDomain` String,
`RefererDomain` String,
`Refresh` UInt8,
`IsRobot` UInt8,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`FlashMinor2` String,
`NetMajor` UInt8,
`NetMinor` UInt8,
`UserAgentMajor` UInt16,
`UserAgentMinor` FixedString(2),
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`MobilePhone` UInt8,
`MobilePhoneModel` String,
`Params` String,
`IPNetworkID` UInt32,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`IsArtifical` UInt8,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`ClientTimeZone` Int16,
`ClientEventTime` DateTime,
`SilverlightVersion1` UInt8,
`SilverlightVersion2` UInt8,
`SilverlightVersion3` UInt32,
`SilverlightVersion4` UInt16,
`PageCharset` String,
`CodeVersion` UInt32,
`IsLink` UInt8,
`IsDownload` UInt8,
`IsNotBounce` UInt8,
`FUniqID` UInt64,
`HID` UInt32,
`IsOldCounter` UInt8,
`IsEvent` UInt8,
`IsParameter` UInt8,
`DontCountHits` UInt8,
`WithHash` UInt8,
`HitColor` FixedString(1),
`UTCEventTime` DateTime,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`WindowName` Int32,
`OpenerName` Int32,
`HistoryLength` Int16,
`BrowserLanguage` FixedString(2),
`BrowserCountry` FixedString(2),
`SocialNetwork` String,
`SocialAction` String,
`HTTPError` UInt16,
`SendTiming` Int32,
`DNSTiming` Int32,
`ConnectTiming` Int32,
`ResponseStartTiming` Int32,
`ResponseEndTiming` Int32,
`FetchTiming` Int32,
`RedirectTiming` Int32,
`DOMInteractiveTiming` Int32,
`DOMContentLoadedTiming` Int32,
`DOMCompleteTiming` Int32,
`LoadEventStartTiming` Int32,
`LoadEventEndTiming` Int32,
`NSToDOMContentLoadedTiming` Int32,
`FirstPaintTiming` Int32,
`RedirectCount` Int8,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`ParamPrice` Int64,
`ParamOrderID` String,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`GoalsReached` Array(UInt32),
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`RefererHash` UInt64,
`URLHash` UInt64,
`CLID` UInt32,
`YCLID` UInt64,
`ShareService` String,
`ShareURL` String,
`ShareTitle` String,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`IslandID` FixedString(16),
`RequestNum` UInt32,
`RequestTry` UInt8
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID);
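The SAMPLE BY clause above enables approximate queries. Once the data is loaded, a query like the following reads only about a tenth of all users (a sketch; the 0.1 sampling ratio is arbitrary):
SELECT CounterID, count() AS c
FROM tutorial.hits_v1
SAMPLE 0.1
GROUP BY CounterID
ORDER BY c DESC
LIMIT 10;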
CREATE TABLE tutorial.visits_v1
(
`CounterID` UInt32,
`StartDate` Date,
`Sign` Int8,
`IsNew` UInt8,
`VisitID` UInt64,
`UserID` UInt64,
`StartTime` DateTime,
`Duration` UInt32,
`UTCStartTime` DateTime,
`PageViews` Int32,
`Hits` Int32,
`IsBounce` UInt8,
`Referer` String,
`StartURL` String,
`RefererDomain` String,
`StartURLDomain` String,
`EndURL` String,
`LinkURL` String,
`IsDownload` UInt8,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`PlaceID` Int32,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`IsYandex` UInt8,
`GoalReachesDepth` Int32,
`GoalReachesURL` Int32,
`GoalReachesAny` Int32,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`MobilePhoneModel` String,
`ClientEventTime` DateTime,
`RegionID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`IPNetworkID` UInt32,
`SilverlightVersion3` UInt32,
`CodeVersion` UInt32,
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`UserAgentMajor` UInt16,
`UserAgentMinor` UInt16,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`SilverlightVersion2` UInt8,
`SilverlightVersion4` UInt16,
`FlashVersion3` UInt16,
`FlashVersion4` UInt16,
`ClientTimeZone` Int16,
`OS` UInt8,
`UserAgent` UInt8,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`NetMajor` UInt8,
`NetMinor` UInt8,
`MobilePhone` UInt8,
`SilverlightVersion1` UInt8,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`JavaEnable` UInt8,
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`BrowserLanguage` UInt16,
`BrowserCountry` UInt16,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`Params` Array(String),
`Goals` Nested(
ID UInt32,
Serial UInt32,
EventTime DateTime,
Price Int64,
OrderID String,
CurrencyID UInt32),
`WatchIDs` Array(UInt64),
`ParamSumPrice` Int64,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`ClickLogID` UInt64,
`ClickEventID` Int32,
`ClickGoodEvent` Int32,
`ClickEventTime` DateTime,
`ClickPriorityID` Int32,
`ClickPhraseID` Int32,
`ClickPageID` Int32,
`ClickPlaceID` Int32,
`ClickTypeID` Int32,
`ClickResourceID` Int32,
`ClickCost` UInt32,
`ClickClientIP` UInt32,
`ClickDomainID` UInt32,
`ClickURL` String,
`ClickAttempt` UInt8,
`ClickOrderID` UInt32,
`ClickBannerID` UInt32,
`ClickMarketCategoryID` UInt32,
`ClickMarketPP` UInt32,
`ClickMarketCategoryName` String,
`ClickMarketPPName` String,
`ClickAWAPSCampaignName` String,
`ClickPageName` String,
`ClickTargetType` UInt16,
`ClickTargetPhraseID` UInt64,
`ClickContextType` UInt8,
`ClickSelectType` Int8,
`ClickOptions` String,
`ClickGroupBannerID` Int32,
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`FirstVisit` DateTime,
`PredLastVisit` Date,
`LastVisit` Date,
`TotalVisits` UInt32,
`TraficSource` Nested(
ID Int8,
SearchEngineID UInt16,
AdvEngineID UInt8,
PlaceID UInt16,
SocialSourceNetworkID UInt8,
Domain String,
SearchPhrase String,
SocialSourcePage String),
`Attendance` FixedString(16),
`CLID` UInt32,
`YCLID` UInt64,
`NormalizedRefererHash` UInt64,
`SearchPhraseHash` UInt64,
`RefererDomainHash` UInt64,
`NormalizedStartURLHash` UInt64,
`StartURLDomainHash` UInt64,
`NormalizedEndURLHash` UInt64,
`TopLevelDomain` UInt64,
`URLScheme` UInt64,
`OpenstatServiceNameHash` UInt64,
`OpenstatCampaignIDHash` UInt64,
`OpenstatAdIDHash` UInt64,
`OpenstatSourceIDHash` UInt64,
`UTMSourceHash` UInt64,
`UTMMediumHash` UInt64,
`UTMCampaignHash` UInt64,
`UTMContentHash` UInt64,
`UTMTermHash` UInt64,
`FromHash` UInt64,
`WebVisorEnabled` UInt8,
`WebVisorActivity` UInt32,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`Market` Nested(
Type UInt8,
GoalID UInt32,
OrderID String,
OrderPrice Int64,
PP UInt32,
DirectPlaceID UInt32,
DirectOrderID UInt32,
DirectBannerID UInt32,
GoodID String,
GoodName String,
GoodQuantity Int32,
GoodPrice Int64),
`IslandID` FixedString(16)
)
ENGINE = CollapsingMergeTree(Sign)
PARTITION BY toYYYYMM(StartDate)
ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)
SAMPLE BY intHash32(UserID);
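Note the Sign column: CollapsingMergeTree collapses pairs of rows that differ only in Sign (1 for a state row, -1 for its cancellation) during background merges, so while merges are still pending, aggregates should be weighted by Sign. A minimal sketch of the usual pattern:
SELECT
    CounterID,
    sum(PageViews * Sign) AS page_views,
    sum(Sign) AS visits
FROM tutorial.visits_v1
GROUP BY CounterID
HAVING sum(Sign) > 0
LIMIT 10;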
curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv
curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv
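Load the downloaded TSV files and verify the row count (the two INSERT commands are taken from the official ClickHouse tutorial):
clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv
clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv
clickhouse-client --query "SELECT count() FROM tutorial.hits_v1"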
--- Client connection tool
clickhouse-client -m -u root -p root
Show the SQL queries currently executing
SHOW PROCESSLIST
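A long-running query found in the process list can be cancelled by its query_id (the id below is a placeholder):
KILL QUERY WHERE query_id = 'xxx-query-id' SYNC;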
Create table statement
create TABLE test.test( id Int32, create_date Date, c2 Nullable(String) )
engine=MergeTree(create_date, (id, create_date), 8192);
Table alteration statements
The ALTER query is only supported for MergeTree-family tables (as well as the Merge and Distributed engines). The query has several variants.
ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|MODIFY COLUMN
--change a column's type
alter TABLE test.ontime_wpp_t MODIFY COLUMN TailNum Nullable(String);
alter TABLE test.ontime_wpp_t ADD COLUMN TailNum2 Nullable(String) after Div5TailNum;
alter TABLE test.ontime_wpp_t drop COLUMN TailNum2;
Monitoring table mutations via system tables
select * from system.mutations where is_done = 0 order by create_time desc limit 1;
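A stuck mutation can be cancelled with KILL MUTATION; the mutation_id below is a placeholder taken from system.mutations:
KILL MUTATION WHERE database = 'test' AND table = 'ontime_wpp_t' AND mutation_id = 'mutation_3.txt';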
Drop a table
drop table cdm_dwd.dwd_ord_car_sharing_df on cluster crm_4shards_1replicas;
Delete all data (truncate)
truncate table cdm_dwd.dwd_ord_car_sharing_df on cluster crm_4shards_1replicas;
Rename a table
RENAME TABLE test.ontime_wpp_t to test.ontime_wpp_t2;
Cluster-wide operation
RENAME TABLE cdm_dwd.dwd_ord_carsh_base_df2 to cdm_dwd.dwd_ord_carsh_base_df on cluster crm_4shards_1replicas;
1. Data export
Run on the relevant node:
echo 'select * from test.ads_user_portrait_vertical_df_cls' | curl localhost:8123?database=test -uroot:root -d @- > table_name.sql
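An alternative export that bypasses the HTTP interface and writes straight from clickhouse-client (a sketch; pick whatever output format you need):
clickhouse-client --query="select * from test.ads_user_portrait_vertical_df_cls FORMAT TabSeparated" > table_name.tsv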
2. Data import, with tab as the delimiter
Run locally on the host of the target database:
cat table_name.sql | clickhouse-client --query="INSERT INTO database.table_name FORMAT TabSeparated"
cat /root/user_lable_local_mid_cluster.tgz | clickhouse-client --user hadoop --password hadoop --query="INSERT INTO lmmbase.user_lable_local_mid_cluster FORMAT CSV";
cat /home/hadoop/work_wpp/user_label_uid_cluster | clickhouse-client --user hadoop --password hadoop --query="INSERT INTO lmmbase.user_label_uid_cluster FORMAT CSV";
cat /tmp/test_user2| clickhouse-client --user hadoop --password hadoop --query="INSERT INTO lmmbase.test_user2 FORMAT CSV";
Insert statements
Non-strict insert: columns that are not listed are automatically filled with their default values
INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22)
Strict insert: a value must be supplied for every listed column
INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23)
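For example, against the test.test table created earlier (the values are made up):
INSERT INTO test.test (id, create_date) VALUES (1, '2019-10-17'), (2, '2019-10-18');  -- c2 falls back to its default, NULL
INSERT INTO test.test (id, create_date, c2) VALUES (3, '2019-10-19', 'tag');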
cat /tmp/user_point_info | clickhouse-client --query="INSERT INTO test.user_point_info FORMAT CSV";
clickhouse-client -m --user hadoop --password hadoop --query="truncate table lmmbase.user_label_uid on cluster crm_4shards_1replicas";
ssh hadoop@dn1 "/bin/bash /home/hadoop/app/otherApp/truncate_user_label_uid_data.sh"
clickhouse-client --query=" alter table test.ads_user_portrait_vertical_df delete where create_time ='2019-10-17' ";
Load testing: run these queries concurrently from cron and watch the machine load
*/2 * * * * clickhouse-client -m --query="select t_mac,t_type,count(*) cnt from carendpoint_porlog_cls group by t_mac,t_type order by cnt desc limit 100;"
*/2 * * * * clickhouse-client -m --query="select t_mac,count(*) cnt from carendpoint_porlog_cls group by t_mac order by cnt desc limit 100;"
*/2 * * * * clickhouse-client -m --query="select t_type,count(*) cnt from carendpoint_porlog_cls group by t_type order by cnt desc limit 100;"
*/1 * * * * clickhouse-client -m --query="select t_ip,t_type,count(*) cnt from carendpoint_porlog_cls group by t_ip,t_type order by cnt desc limit 100;" >> /root/wpp1.log
*/1 * * * * clickhouse-client -m --query="select t_ip,count(*) cnt from carendpoint_porlog_cls group by t_ip order by cnt desc limit 100;" >> /root/wpp2.log
*/1 * * * * clickhouse-client -m --query="select event,count(*) cnt from carendpoint_porlog_cls group by event order by cnt desc limit 100;" >> /root/wpp2.log
————————————————
Copyright notice: this part is an original article by CSDN blogger "wppwpp1", licensed under CC 4.0 BY-SA; include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/wppwpp1/article/details/106021619/
Recently I have been doing some ClickHouse work, so I am recording the operations here.
1. Our database uses the MergeTree engine, so UPDATE and DELETE are supported (as ALTER mutations).
2. The operations:
Below is a data UPDATE script; to update multiple columns, just list them after UPDATE, separated by commas.
alter table summer_cn.bi_ime_sale_history update sale_date = toDateTime('2020-03-31 00:00:00') where id in (
'IDVIVO_sale1245122',
'IDVIVO_sale1245174',
'IDVIVO_sale1245233',
'IDVIVO_sale1245173');
Next is the DELETE SQL; the WHERE clause holds the filter condition.
alter table summer_cn.bi_ime_sale_history delete where id in (
'IDVIVO_sale1245122',
'IDVIVO_sale1245174',
'IDVIVO_sale1245233',
'IDVIVO_sale1245173');
Next is the SELECT SQL; SELECT works the same way as in an ordinary database.
select * from summer_cn.bi_ime_sale_history where id in (
'IDVIVO_sale1245122',
'IDVIVO_sale1245174',
'IDVIVO_sale1245233',
'IDVIVO_sale1245173');
#!/bin/bash
# Two arguments: a start date and an end date; loops over every day in between
# Input format:  20191001 20191031
# Output format: 2019-10-01 2019-10-31
startdate=`date -d "+0 day $1" +%Y%m%d`
enddate=`date -d "+0 day $2" +%Y%m%d`
while [[ $startdate -le $enddate ]]
do
date=$startdate
startdateother=`date -d "+0 day $date" +%Y-%m-%d`
dateother=$startdateother
echo "$dateother" >> /root/success.txt
startdate=`date -d "+1 day $date" +%Y%m%d`
startdateother=`date -d "+1 day $date" +%Y-%m-%d`
sql1="insert into database.table select '$dateother' as stat_date,vehicleno,vehiclecolor,count(vehicleno) as cn from database.table where up_date between '$dateother 00:00:00' and '$dateother 23:59:59' group by vehicleno,vehiclecolor"
sql2="insert into database.table select '$dateother' stat_date,vehicleno ,(max( toUInt32OrZero(vec3)) - min( toUInt32OrZero(vec3))) mileage,vehiclecolor,now() create_date from database.table where up_date between '$dateother 00:00:00' and '$dateother 23:59:59' group by vehicleno,vehiclecolor"
clickhouse-client --host XXX --port 9000 --user <your_user> --password <your_password> --multiquery -q "${sql1};${sql2}"
echo "$dateother done" >> /root/success.txt
done
./xxx.sh 20191001 20191031
vim /root/success.txt
https://blog.51cto.com/wujianwei/3034589
CREATE TABLE test_table(
province String,
province_name String,
create_date date
) ENGINE = MergeTree(create_date, (province), 8192);
Notes on the create-table SQL format:
ENGINE: the table engine type. MergeTree is the most common; Log is also fairly common. (In this legacy syntax) MergeTree requires a date column and a primary key; Log has no such requirement.
create_date: the table's date column; such a table must have one date column.
province: the table's primary key; the key may contain multiple columns, separated by commas.
8192: the index granularity; just use the default of 8192.
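The engine signature above is the legacy syntax; the same table in the current syntax would look like this (a sketch):
CREATE TABLE test_table (
    province String,
    province_name String,
    create_date Date
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(create_date)
ORDER BY province
SETTINGS index_granularity = 8192;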
cat > test_table.csv << EOF
WA,WA_NAME,2020-08-25
CA,CA_NAME,2020-09-25
OR,OR_NAME,2020-10-25
EOF
/opt/data/test_table.csv
clickhouse-client --password 'N03P6VHv)bnz1@%xOg' --query "INSERT INTO tutorial.test_table FORMAT CSV" < test_table.csv;
cat test_table.csv | clickhouse-client --query "INSERT INTO tutorial.test_table FORMAT CSV"
clickhouse-client -udefault --password='N03P6VHv)bnz1@%xOg' --query "select * from tutorial.test_table limit 2";
WA WA_NAME 2020-08-25
CA CA_NAME 2020-09-25
[root@tidb06 ~]#
[root@tidb06 ~]# clickhouse-client -udefault --password='N03P6VHv)bnz1@%xOg' --query "select * from tutorial.test_table01 limit 2";
Hive external partitioned table
-- dwd.p2_min_yield_val definition
CREATE external TABLE `dwd.p2_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string);
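Since this is an external table, partition directories created directly on HDFS are invisible until they are registered in the metastore; repair the table to pick them up:
MSCK REPAIR TABLE dwd.p2_min_yield_val;
SHOW PARTITIONS dwd.p2_min_yield_val;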
--0x10
-- dwd.p2_min_yield_val definition
CREATE EXTERNAL TABLE `dwd.p2_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://edh/user/hive/warehouse/dwd.db/p2_min_yield_val'
TBLPROPERTIES (
'transient_lastDdlTime'='1639188365');
--0x11
Hive partitioned table
-- dwd.p_min_yield_val definition
CREATE TABLE `dwd.p_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
'path'='hdfs://edh/user/hive/warehouse/dwd.db/p_min_yield_val')
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://edh/user/hive/warehouse/dwd.db/p_min_yield_val'
TBLPROPERTIES (
'spark.sql.partitionProvider'='catalog',
'spark.sql.sources.provider'='parquet',
'spark.sql.sources.schema.numPartCols'='4',
'spark.sql.sources.schema.numParts'='1',
'spark.sql.sources.schema.part.0'='{\"type\":\"struct\",\"fields\":[{\"name\":\"key\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"host_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"model_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dev_group\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"path_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"staff_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"staff_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"class_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"variety\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"yarn_count\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"class_order\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"class_order_alias\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"efficiency\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"factor\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_m\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_kg\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_add_m\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"output_add_kg\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"htime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"crt\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"online\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"current_shift\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"year\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"month\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"day\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"producer_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}',
'spark.sql.sources.schema.partCol.0'='year',
'spark.sql.sources.schema.partCol.1'='month',
'spark.sql.sources.schema.partCol.2'='day',
'spark.sql.sources.schema.partCol.3'='producer_id',
'transient_lastDdlTime'='1638949169');
Hive partitioned + bucketed table
-- dwd.p3_min_yield_val definition
CREATE external TABLE `dwd.p3_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string) clustered by (host_id) into 3 buckets;
CREATE external TABLE `dwd.p4_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string) clustered by (class_order) into 3 buckets;
--- Load data into the partitioned + bucketed table
---- start: partition + bucket, bucketed by class_order
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val where crt='2021-12-10 09:00:03'
cluster by (class_order);
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val where crt='2021-12-11 03:02:02'
cluster by (class_order);
---- end
--01
-- dwd.p3_min_yield_val definition
CREATE EXTERNAL TABLE `dwd.p3_min_yield_val`(
`key` string,
`id` string,
`name` string,
`host_id` string,
`model_id` string,
`dev_group` string,
`path_id` string,
`staff_id` string,
`staff_name` string,
`class_type` string,
`variety` string,
`yarn_count` string,
`class_order` string,
`class_order_alias` string,
`efficiency` string,
`factor` string,
`output_m` string,
`output_kg` string,
`output_add_m` string,
`output_add_kg` string,
`htime` string,
`crt` string,
`online` string,
`current_shift` string)
PARTITIONED BY (
`year` string,
`month` string,
`day` string,
`producer_id` string)
CLUSTERED BY (
host_id)
INTO 3 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://edh/user/hive/warehouse/dwd.db/p3_min_yield_val'
TBLPROPERTIES (
'transient_lastDdlTime'='1639188765');
-- 02
Set the enforce-bucketing property:
set hive.enforce.bucketing=true;   -- or false
If the number of reducers does not match the number of buckets, set the reducer count manually.
Set the number of reduce tasks:
set mapreduce.job.reduces=4;
Bucketed query tests:
select * from buc1
cluster by (uid)
1. Query everything:
select * from buc3;
select * from buc3 tablesample(bucket 1 out of 1)
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 1 on class_order)
WHERE producer_id = '2162304858206502921';
Query a specific bucket:
select * from buc3 tablesample(bucket 1 out of 4 on uid); -- rows where hash(uid) % 4 == 0
select * from buc3 tablesample(bucket 1 out of 2 on uid);
-- query one bucket out of a 3-way split
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 3 on class_order) -- rows where hash(class_order) % 3 == 0
WHERE producer_id = '2162304858206502921';
-- query one bucket out of a 2-way split (hash % 2 == 0)
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
-- query rows where class_order = 2
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 3 on class_order)
WHERE producer_id = '2162304858206502921';
select *
from dwd.p4_min_yield_val tablesample(bucket 0 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
-- query rows with even class_order (= 2)
select *
from dwd.p4_min_yield_val tablesample(bucket 1 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
-- query rows with odd class_order (= 1)
select *
from dwd.p4_min_yield_val tablesample(bucket 2 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
1. Make Hive enforce bucketing, automatically bucketing data according to the table's bucket definition (recommended):
set hive.enforce.bucketing = true;
2. Manually set the number of reducers:
set mapreduce.job.reduces = num;
or
set mapreduce.reduce.tasks = num;
and add a CLUSTER BY clause after the SELECT.
Explanation:
tablesample(bucket x out of y on uid)
x: the bucket number to start reading from
y: the sampling denominator; y should be a factor or a multiple of the table's total bucket count, and x must not be greater than y
Neither compressed nor stretched: 1 out of 4 on a 4-bucket table returns exactly bucket 1.
Compressed: 1 out of 2 on a 4-bucket table returns 4/2 = 2 buckets, namely bucket 1 and bucket 1 + 4/2 = 3.
Stretched: 1 out of 8 on a 4-bucket table returns 4/8 = 1/2 of bucket 1.
This effectively re-buckets all of the data.
Example:
Query rows with odd uid (TABLESAMPLE must come immediately after the table name):
select * from buc3 tablesample(bucket 2 out of 2 on uid)
Queries:
select * from part_tmp limit 3;
select * from part_tmp tablesample(3 rows);
select * from part_tmp tablesample(13 percent);
select * from part_tmp tablesample(56B); ### size units: B K M G T P
Randomly sample 3 rows:
select * from part_tmp order by rand() limit 3;
select rand() as rand1, t.* from part_tmp t;
Bucketing summary:
1. Definition:
clustered by (uid) -- specifies the bucketing column
sorted by (uid desc) -- specifies the sort rule, i.e. the data is expected to be stored sorted by this column in this order
2. Loading data:
cluster by (uid) -- specifies the column getPartition hashes on; the sort column is the same one, ascending by default
distribute by (uid) -- specifies the column getPartition hashes on
sort by (uid asc) -- specifies the sort column and sort order
-- the more flexible form: this way of loading lets you choose the getPartition column and the sort column independently
cluster by (uid) produces the same result as distribute by (uid) sort by (uid asc), as the sketch below shows
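A sketch of that equivalence, loading a hypothetical bucketed table buc3 (assumed to match stu_temp's columns) from the stu_temp staging table defined below:
insert overwrite table buc3
select uid, uname, usex from stu_temp
cluster by (uid);
-- same result:
insert overwrite table buc3
select uid, uname, usex from stu_temp
distribute by (uid) sort by (uid asc);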
Bucketing within partitions:
Example: partition by sex (1 = male, 2 = female), and within each partition bucket by whether uid is odd or even:
1 gyy1 1
2 gyy2 2
3 gyy3 2
4 gyy4 1
5 gyy5 2
6 gyy6 1
7 gyy7 1
8 gyy8 2
9 gyy9 1
10 gyy10 1
11 gyy11 2
12 gyy12 1
1. Create a staging table:
create table if not exists stu_temp(
uid int,
uname string,
usex int
)
row format delimited
fields terminated by ' '
;
Load the data:
load data local inpath '/usr/local/hivedata/stu.dat' into table stu_temp;
Create the partitioned + bucketed table:
create table if not exists stus(
uid int,
uname string
)
partitioned by (sex int)
clustered by (uid) into 2 buckets
row format delimited
fields terminated by ' '
;
-- the insert into approach:
insert into table stus partition(sex)
select uid,uname,usex from stu_temp
cluster by (uid)
;
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val limit 30;
insert into dwd.p4_min_yield_val partition(year,month,day,producer_id) select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val cluster by (class_order) limit 30;
set hive.enforce.bucketing=true;
Requirement:
Query female students whose student number (uid) is odd:
select *
from stus tablesample(bucket 2 out of 2 on uid)
where sex=2;
-- SELECT * FROM dwd.p_min_yield_val WHERE day = '2021-12-09' AND producer_id = '2162304858206502921' ORDER BY crt DESC;
-- query the day-shift data where class_order = 1
select *
from dwd.p4_min_yield_val tablesample(bucket 2 out of 2 on class_order)
WHERE producer_id = '2162304858206502921';
/user/hive/warehouse/dwd.db/p4_min_yield_val/year=2021/month=2021-12/day=2021-12-10
Notes:
1. Partitioning uses columns outside the table schema, while bucketing uses columns from the table itself.
2. Bucketing divides and manages the data at a finer granularity, and is mostly used for data sampling and JOIN optimization (see the sketch below).
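For the JOIN case in note 2: when two tables are bucketed on the join key with compatible bucket counts, Hive can use a bucket map join. A sketch, assuming a hypothetical stus_score table clustered by (uid) into 2 buckets like stus:
set hive.optimize.bucketmapjoin=true;
select /*+ MAPJOIN(s2) */ s1.uid, s1.uname, s2.score
from stus s1
join stus_score s2 on s1.uid = s2.uid;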
————————————————
Copyright notice: this part is an original article by CSDN blogger "brz_em", licensed under CC 4.0 BY-SA; include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/qq_35180983/article/details/82902943