（37）DWD 层（业务数据）

本文主要是介绍（37）DWD 层（业务数据），希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

业务数据方面 DWD 层的搭建主要注意点在于维度建模，减少后续大量 Join 操作。

1. 商品维度表（全量）

商品维度表主要是将商品 SKU 表、商品一级分类表、商品二级分类表、商品三级分类表、商品品牌表和商品 SPU 表联接为一张商品维度表。

1 ）建表语句

hive (gmall)>

-- SKU dimension table of the DWD layer.
-- Drops any existing table of the same name, then declares it again as an
-- external, dt-partitioned Parquet table at the warehouse path given below.
DROP TABLE IF EXISTS `dwd_dim_sku_info`;

CREATE EXTERNAL TABLE `dwd_dim_sku_info` (
    `id`             STRING         COMMENT ' 商品 id',
    `spu_id`         STRING         COMMENT 'spuid',
    `price`          DECIMAL(16,2)  COMMENT ' 商品价格 ',
    `sku_name`       STRING         COMMENT ' 商品名称 ',
    `sku_desc`       STRING         COMMENT ' 商品描述 ',
    `weight`         DECIMAL(16,2)  COMMENT ' 重量 ',
    `tm_id`          STRING         COMMENT ' 品牌 id',
    `tm_name`        STRING         COMMENT ' 品牌名称 ',
    `category3_id`   STRING         COMMENT ' 三级分类 id',
    `category2_id`   STRING         COMMENT ' 二级分类 id',
    `category1_id`   STRING         COMMENT ' 一级分类 id',
    `category3_name` STRING         COMMENT ' 三级分类名称 ',
    `category2_name` STRING         COMMENT ' 二级分类名称 ',
    `category1_name` STRING         COMMENT ' 一级分类名称 ',
    `spu_name`       STRING         COMMENT 'spu 名称 ',
    `create_time`    STRING         COMMENT ' 创建时间 '
)
COMMENT ' 商品维度表 '
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_sku_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2）数据装载

hive (gmall)>

-- Loads the 2020-06-14 partition of dwd_dim_sku_info by joining the ODS SKU
-- snapshot with its trademark, SPU and three category levels for the same dt.
-- All joins are inner joins, so a SKU without a matching row in any of the
-- joined tables is not written to the dimension table.
-- Each subquery lists only the columns that the outer query or a join key
-- needs, instead of select *, so an ODS schema change cannot silently change
-- what is read here.
-- NOTE(review): the input format is switched to HiveInputFormat for this
-- session, presumably because of how the ODS files are stored. Confirm.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;

INSERT OVERWRITE TABLE dwd_dim_sku_info PARTITION (dt = '2020-06-14')
SELECT
    sku.id,
    sku.spu_id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.tm_id,
    ob.tm_name,
    sku.category3_id,
    c2.id   AS category2_id,
    c1.id   AS category1_id,
    c3.name AS category3_name,
    c2.name AS category2_name,
    c1.name AS category1_name,
    spu.spu_name,
    sku.create_time
FROM (
    SELECT
        id,
        spu_id,
        price,
        sku_name,
        sku_desc,
        weight,
        tm_id,
        category3_id,
        create_time
    FROM ods_sku_info
    WHERE dt = '2020-06-14'
) sku
INNER JOIN (
    SELECT
        tm_id,
        tm_name
    FROM ods_base_trademark
    WHERE dt = '2020-06-14'
) ob
    ON sku.tm_id = ob.tm_id
INNER JOIN (
    SELECT
        id,
        spu_name
    FROM ods_spu_info
    WHERE dt = '2020-06-14'
) spu
    ON spu.id = sku.spu_id
INNER JOIN (
    SELECT
        id,
        name,
        category2_id
    FROM ods_base_category3
    WHERE dt = '2020-06-14'
) c3
    ON sku.category3_id = c3.id
INNER JOIN (
    SELECT
        id,
        name,
        category1_id
    FROM ods_base_category2
    WHERE dt = '2020-06-14'
) c2
    ON c3.category2_id = c2.id
INNER JOIN (
    SELECT
        id,
        name
    FROM ods_base_category1
    WHERE dt = '2020-06-14'
) c1
    ON c2.category1_id = c1.id;

3 ）查询加载结果

hive (gmall)> select * from dwd_dim_sku_info where dt='2020-06-14' limit 2;

2.优惠券维度表（全量）

把 ODS 层 ods_coupon_info 表的数据导入到 DWD 层优惠券维度表，在导入过程中可以做适当的清洗。

1 ）建表语句

hive (gmall)>

-- Coupon dimension table of the DWD layer.
-- Drops any existing table of the same name, then declares it again as an
-- external, dt-partitioned Parquet table at the warehouse path given below.
DROP TABLE IF EXISTS dwd_dim_coupon_info;

CREATE EXTERNAL TABLE dwd_dim_coupon_info (
    `id`               STRING         COMMENT ' 购物券编号 ',
    `coupon_name`      STRING         COMMENT ' 购物券名称 ',
    `coupon_type`      STRING         COMMENT ' 购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券 ',
    `condition_amount` DECIMAL(16,2)  COMMENT ' 满额数 ',
    `condition_num`    BIGINT         COMMENT ' 满件数 ',
    `activity_id`      STRING         COMMENT ' 活动编号 ',
    `benefit_amount`   DECIMAL(16,2)  COMMENT ' 减金额 ',
    `benefit_discount` DECIMAL(16,2)  COMMENT ' 折扣 ',
    `create_time`      STRING         COMMENT ' 创建时间 ',
    `range_type`       STRING         COMMENT ' 范围类型 1 、商品 2 、品类 3 、品牌 ',
    `spu_id`           STRING         COMMENT ' 商品 id',
    `tm_id`            STRING         COMMENT ' 品牌 id',
    `category3_id`     STRING         COMMENT ' 品类 id',
    `limit_num`        BIGINT         COMMENT ' 最多领用次数 ',
    `operate_time`     STRING         COMMENT ' 修改时间 ',
    `expire_time`      STRING         COMMENT ' 过期时间 '
)
COMMENT ' 优惠券维度表 '
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_coupon_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2 ）数据装载

hive (gmall)>

-- Copies the 2020-06-14 snapshot of ods_coupon_info, column for column, into
-- the matching partition of dwd_dim_coupon_info. No rows are filtered beyond
-- the dt predicate and no values are transformed.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;

INSERT OVERWRITE TABLE dwd_dim_coupon_info PARTITION (dt = '2020-06-14')
SELECT
    ci.id,
    ci.coupon_name,
    ci.coupon_type,
    ci.condition_amount,
    ci.condition_num,
    ci.activity_id,
    ci.benefit_amount,
    ci.benefit_discount,
    ci.create_time,
    ci.range_type,
    ci.spu_id,
    ci.tm_id,
    ci.category3_id,
    ci.limit_num,
    ci.operate_time,
    ci.expire_time
FROM ods_coupon_info ci
WHERE ci.dt = '2020-06-14';

3 ）查询加载结果

hive (gmall)> select * from dwd_dim_coupon_info where dt='2020-06-14' limit 2;

3. 活动维度表（全量）

1 ）建表语句

hive (gmall)>

-- Activity dimension table of the DWD layer.
-- Drops any existing table of the same name, then declares it again as an
-- external, dt-partitioned Parquet table at the warehouse path given below.
DROP TABLE IF EXISTS dwd_dim_activity_info;

CREATE EXTERNAL TABLE dwd_dim_activity_info (
    `id`            STRING COMMENT ' 编号 ',
    `activity_name` STRING COMMENT ' 活动名称 ',
    `activity_type` STRING COMMENT ' 活动类型 ',
    `start_time`    STRING COMMENT ' 开始时间 ',
    `end_time`      STRING COMMENT ' 结束时间 ',
    `create_time`   STRING COMMENT ' 创建时间 '
)
COMMENT ' 活动信息表 '
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_activity_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2 ）数据装载

hive (gmall)>

-- Copies the 2020-06-14 snapshot of ods_activity_info, column for column,
-- into the matching partition of dwd_dim_activity_info.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;

INSERT OVERWRITE TABLE dwd_dim_activity_info PARTITION (dt = '2020-06-14')
SELECT
    ai.id,
    ai.activity_name,
    ai.activity_type,
    ai.start_time,
    ai.end_time,
    ai.create_time
FROM ods_activity_info ai
WHERE ai.dt = '2020-06-14';

3 ）查询加载结果

hive (gmall)> select * from dwd_dim_activity_info where dt='2020-06-14' limit 2;

4. 地区维度表（特殊）

1 ）建表语句

hive (gmall)>

-- Province dimension table of the DWD layer.
-- Unlike the daily dimension tables, this one declares no dt partition.
-- Drops any existing table of the same name, then declares it again as an
-- external Parquet table at the warehouse path given below.
DROP TABLE IF EXISTS `dwd_dim_base_province`;

CREATE EXTERNAL TABLE `dwd_dim_base_province` (
    `id`            STRING COMMENT 'id',
    `province_name` STRING COMMENT ' 省市名称 ',
    `area_code`     STRING COMMENT ' 地区编码 ',
    `iso_code`      STRING COMMENT 'ISO 编码 ',
    `region_id`     STRING COMMENT ' 地区 id',
    `region_name`   STRING COMMENT ' 地区名称 '
)
COMMENT ' 地区维度表 '
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_base_province/'
TBLPROPERTIES ("parquet.compression"="lzo");

2 ）数据装载

hive (gmall)>

-- Rebuilds dwd_dim_base_province from the ODS province table, adding the
-- region name looked up from ods_base_region.
-- The join is an inner join, so a province whose region_id has no match in
-- ods_base_region is not written.
-- The two source tables are joined directly. The earlier select * subquery
-- wrappers applied no filter and only hid which columns are actually read.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;

INSERT OVERWRITE TABLE dwd_dim_base_province
SELECT
    bp.id,
    bp.name,
    bp.area_code,
    bp.iso_code,
    bp.region_id,
    br.region_name
FROM ods_base_province bp
INNER JOIN ods_base_region br
    ON bp.region_id = br.id;

3 ）查询加载结果

hive (gmall)> select * from dwd_dim_base_province limit 2;

5. 时间维度表（特殊）

1 ）建表语句

hive (gmall)>

-- Date dimension table of the DWD layer, unpartitioned.
-- Drops any existing table of the same name, then declares it again as an
-- external Parquet table at the warehouse path given below.
-- NOTE(review): the column is named is_workday but its comment string reads
-- "is weekend". Confirm which meaning the loaded data actually carries.
DROP TABLE IF EXISTS `dwd_dim_date_info`;

CREATE EXTERNAL TABLE `dwd_dim_date_info` (
    `date_id`    STRING COMMENT ' 日 ',
    `week_id`    STRING COMMENT ' 周 ',
    `week_day`   STRING COMMENT ' 周的第几天 ',
    `day`        STRING COMMENT ' 每月的第几天 ',
    `month`      STRING COMMENT ' 第几月 ',
    `quarter`    STRING COMMENT ' 第几季度 ',
    `year`       STRING COMMENT ' 年 ',
    `is_workday` STRING COMMENT ' 是否是周末 ',
    `holiday_id` STRING COMMENT ' 是否是节假日 '
)
COMMENT ' 时间维度表 '
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_date_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2 ）把 date_info.txt 文件上传到 hadoop102 的 /opt/module/db_log/ 路径

3 ）数据装载

注意：由于 dwd_dim_date_info 采用列式存储 + LZO 压缩，直接将 date_info.txt 文件加载到目标表并不会自动转换为列式存储 + LZO 压缩格式。因此需要先创建一张普通的文本格式临时表 dwd_dim_date_info_tmp，将 date_info.txt 加载到该临时表中，最后通过查询临时表，把查询到的数据插入到最终的目标表中。

（ 1 ）创建临时表，非列式存储

hive (gmall)>

-- Staging table for the date dimension, with the same nine columns as
-- dwd_dim_date_info. It is declared as tab-delimited text rather than
-- Parquet so that a plain text file can be loaded into it.
DROP TABLE IF EXISTS `dwd_dim_date_info_tmp`;

CREATE EXTERNAL TABLE `dwd_dim_date_info_tmp` (
    `date_id`    STRING COMMENT ' 日 ',
    `week_id`    STRING COMMENT ' 周 ',
    `week_day`   STRING COMMENT ' 周的第几天 ',
    `day`        STRING COMMENT ' 每月的第几天 ',
    `month`      STRING COMMENT ' 第几月 ',
    `quarter`    STRING COMMENT ' 第几季度 ',
    `year`       STRING COMMENT ' 年 ',
    `is_workday` STRING COMMENT ' 是否是周末 ',
    `holiday_id` STRING COMMENT ' 是否是节假日 '
)
COMMENT ' 时间临时表 '
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_dim_date_info_tmp/';

（ 2 ）将数据导入临时表

hive (gmall)>

-- Loads the local date_info.txt file into the text staging table.
-- OVERWRITE replaces whatever the staging table already holds. The staging
-- table is created as EXTERNAL, so files from an earlier run may still be in
-- its location, and a plain INTO load would add a second copy of every date
-- row, which the final insert would then carry into dwd_dim_date_info.
LOAD DATA LOCAL INPATH '/opt/module/db_log/date_info.txt'
OVERWRITE INTO TABLE dwd_dim_date_info_tmp;

（ 3 ）将数据导入正式表

hive (gmall)>

-- Rewrites dwd_dim_date_info from the text staging table. Running the rows
-- through a query is what stores them in the target table's Parquet format.
-- Columns are listed by name, in the target table's order, instead of
-- select *, so the insert no longer depends on both tables keeping the same
-- column order.
INSERT OVERWRITE TABLE dwd_dim_date_info
SELECT
    `date_id`,
    `week_id`,
    `week_day`,
    `day`,
    `month`,
    `quarter`,
    `year`,
    `is_workday`,
    `holiday_id`
FROM dwd_dim_date_info_tmp;

4 ）查询加载结果

hive (gmall)> select * from dwd_dim_date_info;

这篇关于（37）DWD 层（业务数据）的文章就介绍到这儿，希望我们推荐的文章对编程师们有所帮助！

（37）DWD 层（业务数据）

相关文章

SpringBoot分段处理List集合多线程批量插入数据方式

PHP轻松处理千万行数据的方法详解

C#实现千万数据秒级导入的代码

MyBatis-plus处理存储json数据过程

GSON框架下将百度天气JSON数据转JavaBean

C# LiteDB处理时间序列数据的高性能解决方案

Java+AI驱动实现PDF文件数据提取与解析

MySQL中查询和展示LONGBLOB类型数据的技巧总结

使用SpringBoot+InfluxDB实现高效数据存储与查询

Java整合Protocol Buffers实现高效数据序列化实践