CDA持证人阿涛哥

2022-12-30   阅读量: 277

Mysql

mysql 学习41--数据清洗

扫码加入数据分析学习群

-- 淘宝案例数据准备

create database taobao1123;

use taobao1123;

create table UserBehavior(

user_id int,

item_id int,

item_category int,

behavior_type varchar(10),

user_geohash varchar(10),

times datetime,

amount decimal(5,2)

);

load data infile "C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/UserBehavior.csv"

into table UserBehavior

fields terminated by ',' ignore 1 lines;

select * from UserBehavior limit 10;

select count(*) from UserBehavior;#1048575


-- 1,数据清洗

-- 1.1 缺失值处理

###1.1.1缺失值数量

select

count(user_id),

count(item_id),

count(item_category),

count(behavior_type),

count(user_geohash),

count(times),

count(amount)

from UserBehavior;

###1.1.2缺失值比例

select

sum(user_id is null )/count(*),

sum(item_id is null)/count(*),

sum(item_category is null)/count(*),

sum(behavior_type is null)/count(*),

sum(user_geohash is null)/count(*),

sum(times is null)/count(*),

sum(amount is null)/count(*)

from UserBehavior;



-- 1.2 异常值检查

select min(times),max(times),min(amount),max(amount) from UserBehavior;


-- 1.3 重复记录处理

select distinct * from UserBehavior;


0.0000 1 0 关注作者 收藏

评论(0)


暂无数据

推荐课程