HDP 3.1: load a CSV file into a Hive table by partition

HDP 3.1 平台,把csv文件动态load到hive分区表

  • 上传csv文件到HDFS
[d0pang@hadoop1 ~]$ hdfs dfs -put visit.csv /user/d0pang/
  • 创建临时表visit_tmp
    因为csv文件第一行为标题行,建表要增加设置TBLPROPERTIES("skip.header.line.count"="1")
-- Drop the staging table first if it already exists, so the script can be re-run.
-- NOTE: Hive line comments use '--'; '//' is not valid HiveQL and errors in beeline.
DROP TABLE IF EXISTS visit_tmp;
-- Staging table for the raw CSV. The source file's first line is a header row,
-- so skip.header.line.count=1 tells Hive to ignore it when reading.
-- NOTE(review): on HDP 3.1 (Hive 3), managed tables default to transactional ORC;
-- a managed TEXTFILE table may be rejected depending on hive.strict.managed.tables —
-- confirm, and use an EXTERNAL table if so.
CREATE TABLE `visit_tmp`( 
`visit_nbr` int,                                 
`store_nbr` smallint,                            
`visit_date` date,                          
`tot_visit_amt` decimal(19,2),                   
`tot_retail_price` decimal(19,2))
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 
STORED AS TEXTFILE
TBLPROPERTIES("skip.header.line.count"="1");
  • 创建正式表visit
-- Final table, partitioned by `ts` (populated from visit_date at insert time)
-- and stored as ORC.
-- FIX: the documented Hive file-format keyword is ORC, not ORCFILE
-- (valid formats: TEXTFILE, SEQUENCEFILE, RCFILE, ORC, PARQUET, AVRO, JSONFILE).
CREATE TABLE `visit`( 
`visit_nbr` int,                                 
`store_nbr` smallint,                            
`visit_date` date,                          
`tot_visit_amt` decimal(19,2),                   
`tot_retail_price` decimal(19,2)
)
PARTITIONED BY (ts date)
STORED AS ORC;
  • 进入beeline, 把csv文件先load到临时表visit_tmp
-- Move the uploaded HDFS file into the staging table, replacing any prior contents.
-- FIX: target the unqualified `visit_tmp` to match the CREATE TABLE statement above
-- (the original targeted `d0pang_crypto.visit_tmp`, a database never referenced elsewhere).
-- NOTE: LOAD DATA INPATH moves (not copies) the file out of /user/d0pang/.
LOAD DATA INPATH '/user/d0pang/visit.csv' OVERWRITE INTO TABLE visit_tmp;
  • 进入beeline, 把临时表数据转入正式表
-- Enable dynamic partitioning: nonstrict mode allows every partition to be
-- determined from the data, and the limits are raised so a large number of
-- distinct visit_date values does not abort the insert.
-- NOTE: '--' is the HiveQL comment marker; the original '//' comments would
-- fail in beeline.
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=100000;
-- Copy staging rows into the partitioned table; the trailing t1.visit_date
-- supplies the value for the dynamic partition column `ts`.
INSERT OVERWRITE TABLE visit PARTITION (ts) SELECT t1.*,t1.visit_date from visit_tmp t1;

-- Verify the dynamic-partition insert by listing the partitions created.
-- ('--' replaces the invalid '//' comment marker.)
show partitions visit;

Be the first to comment

Leave a Reply

Your email address will not be published.


*