Skip to content

zdcourse/Lucifer

Repository files navigation

Lucifer

Sogou 数据分析(个人)

https://www.sogou.com/labs/resource/q.php https://blog.javachen.com/2014/06/09/hive-data-manipulation-language.html

作业要求

利用 hadoop 平台,将 sogou 数据进行分析,数据大小为 2.14G,记录数为 4500 万条,

基于 map-reduce 代码实现一种数据挖掘或机器学习方法,例如聚类、分类、异常检测等,完成数据分析报告,并提交相关代码

使用 hive 对数据进行统计分析,完成数据分析报告,并提交相关代码

参考手册见附件,仅用于借鉴参考,分析更多的结果,并以图表化结果展示

分析报告,不少于 30 页,有完整的脚本或代码

2017.11.22 日晚上 12 点前班长统一收齐上交

操作步骤

一、数据预处理

看 PDF 文档

ts[\t]uid[\t]keyword[\t]rank[\t]order[\t]url[\t]

sudo -u hdfs hadoop fs -mkdir -p /sogou_ext/20111230
sudo -u hdfs hadoop fs -mkdir -p /sogou/20111230

cp /root/sogou.500w.utf8 /tmp
chown hdfs:hdfs /tmp/sogou.500w.utf8
sudo -u hdfs hadoop fs -put /tmp/sogou.500w.utf8 /sogou/20111230/

cp /root/sogou.500w.utf8.flt /tmp
chown hdfs:hdfs /tmp/sogou.500w.utf8.flt
sudo -u hdfs hadoop fs -put /tmp/sogou.500w.utf8.flt /sogou_ext/20111230/

二、基于 Hive 构建日志数据的数据仓库

输入 hive 命令,进入 hive 交互界面

# 查看数据库
show databases;
# 创建数据库
create database sogou;
# 使用数据库
use sogou;
# 查看所有表名
show tables;
# 创建外部表
create external table sogou.sogou_20111230(ts string,uid string,keyword string,rank int,sorder int,url string)Row FORMAT DELIMITED FIELDS TERMINATED BY '\t' stored as TEXTFILE location '/sogou/20111230'; # 创建数据库要小心关键字冲突,不能使用 date,order,user 等关键字。
# 查看新创建的表结构
show create table sogou.sogou_20111230;
describe sogou.sogou_20111230;
# 删除表
drop table sogou.sogou_20111230;

按照年,月,日,小时分区 创建扩展 4 个字段(年,月,日,小时)数据的外部表

create external table sogou.sogou_ext_20111230(ts string,uid string,keyword string,rank int,sorder int,url string,year int,month int,day int,hour int)row format delimited fields terminated by '\t' stored as textfile location '/sogou_ext/20111230';

创建带分区的表:

create external table sogou.sogou_partition(ts string,uid string,keyword string,rank int,sorder int,url string)partitioned by (year int,month int,day int,hour int) row format delimited fields terminated by '\t' stored as textfile;

灌入数据:

# 设置动态分区。nonstrict 全分区字段是动态的
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table sogou.sogou_partition partition(year,month,day,hour) select * from sogou.sogou_ext_20111230;

查询结果

select * from sogou_ext_20111230 limit 10;
select url from sogou_ext_20111230 limit 10;
select * from sogou_ext_20111230 where uid='96994a0480e7e1edcaef67b20d8816b7';

四、条数统计

数据总条数

select count(*) from sogou.sogou_ext_20111230

非空查询条数

select count(*) from sogou.sogou_ext_20111230 where keyword is not null and keyword !='';

无重复总条数

select count(*) from (select ts, uid, keyword, url
from sogou.sogou_ext_20111230
group by ts, uid, keyword, url having count(*)=1) a;

独立 UID 总数

select count(distinct(uid) from sogou.sogou_ext_20111230;

五、关键词分析

查询关键词长度统计

select avg(a.cnt)
from (select size(split(keyword, '\\s+')) as cnt
from sogou.sogou_ext_20111230) a;

查询频度排名(前 50)

select keyword, count(*) as cnt
from sogou.sogou_ext_20111230
group by keyword order by cnt desc limit 50;

六、UID 分析

UID 查询次数分布

select sum(if(uids.cnt=1, 1, 0)),sum(if(uids.cnt=2,1,0)), sum(if(uids.cnt=3,1,0)), sum(if(uids.cnt>3,1,0))
from
(select uid, count(*) as cnt from sogou.sogou_ext_20111230 group by uid) uids;

UID 平均查询次数

select sum(a.cnt)/count(a.uid) from (select uid, count(*) as cnt from sogou.sogou_ext_20111230 group by uid) a;

查询次数大于 2 次的用户总数

select count(a.uid) from (select uid, count(*) as cnt from sogou.sogou_ext_20111230 group by uid having cnt > 2) a;

查询次数大于 2 次的用户占比

-- uid 总数, 结果为 A
select count(distinct (uid)) from sogou.sogou_ext_20111230;

-- 2 次总用户,结果为 B
select count(a.uid) from (select uid, count(*) as cnt from sogou.sogou_ext_20111230 group by uid having cnt > 2) a;

-- B/A

查询次数大于 2 次的数据展示

select b.* from
(select uid, count(*) as cnt from sogou.sogou_ext_20111230 group by uid having cnt > 2) a
join sogou.sogou_ext_20111230 b on a.uid=b.uid limit 50;

七、用户行为分析

点击次数与 Rank 之间的关系

select count(*) from sogou.sogou_ext_20111230 where rank < 11;
select count(*) from sogou.sogou_ext_20111230;

直接输入 URL 作为查询词的比例

-- A
select count(*) from sogou.sogou_ext_20111230 where keyword like '%www%';

-- B
select count(*) from sogou.sogou_ext_20111230;

-- A/B

直接输入 URL 的查询中,点击数点击的结果就是用户输入的 URL 的网址 所占的比例 C

-- C
select sum(if(instr(url, keyword)>0,1,0)) from (select * from sogou.sogou_ext_20111230 where keyword like '%www%') a;

-- C / A

独立用户行为分析

select uid,count(*) as cnt from sogou.sogou_ext_20111230  where keyword='仙剑奇侠传' group by uid having cnt > 3;

查找 uid 是 653d48aa356d5111ac0e59f9fe736429 和 e11c6273e337c1d1032229f1b2321a75 的相关搜索记录

select * from sogou.sogou_ext_20111230 where uid='653d48aa356d5111ac0e59f9fe736429' and keyword like '%仙剑奇侠传%';
select * from sogou.sogou_ext_20111230 where uid='e11c6273e337c1d1032229f1b2321a75' and keyword like '%仙剑奇侠传%';

八、实现数据分析需求五:实时数据

每个 UID 在当天的查询点击次数

创建临时表

create table sogou.uid_cnt(uid STRING, cnt INT) COMMENT 'This is the sogou search data of one day'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;

查询并插入

INSERT OVERWRITE TABLE sogou.uid_cnt select uid,count(*) as cnt from sogou.sogou_ext_20111230 group by uid;

九、使用 Sqoop 将数据导入 Mysql

show databases;
create database test;

create table uid_cnt (
    uid varchar(255) default null,
    cnt int(11) default null
) engine=InnoDB default CHARSET=utf8;

导入数据

sqoop export --connect jdbc:mysql:https://localhost:3306/test --username root --password cloudera --table uid_cnt --export-dir '/user/hive/warehouse/sogou.db/uid_cnt' --fields-terminated-by '\t'

十、HBase Shell 操作实验

hbase shell
list
create 'test', {NAME => 'f1', VERSIONS => 5}
list
put 'test', 'aid001', 'f1:uid', '001'
scan 'test'
get 'test', 'aid001'
describe 'test'
disable 'test'
alter 'test', NAME => 'f1', VERSIONS => 3
enable 'test'

describe 'test'
truncate 'test'
scan 'test'
disable 'test'
drop 'test'
list
create 'uid_cnt', {NAME => 'f1', VERSIONS => 5}

十一、使用 Sqoop 将数据导入到 HBase

sqoop import --connect jdbc:mysql:https://localhost:3306/test --username root --password cloudera --table uid_cnt --hbase-table uid_cnt --column-family f1 --hbase-row-key uid --hbase-create-table -m 1

十二、HBase Java API 访问统计数据

# 将之前的 uid_cnt 数据从 HDFS 复制到本地
hdfs dfs -get /user/hive/warehouse/sogou.db/uid_cnt .
cat ~/uid_cnt/00000* > uid_cnt.output
hadoop jar hbase-example.jar HBaseImportTest /home/zkpk/uid_cnt/uid_cnt.output

十三、Mahout 聚类实验

wget https://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data
hdfs dfs -mkdir testdata
hdfs dfs -put synthetic_control.data testdata/
hdfs dfs -ls testdata
hadoop jar mahout-examples-0.9-job.jar org.apache.mahout.clustering.syntheticcontrol.kmeans.Job
hdfs dfs -ls output

Releases

No releases published

Packages

No packages published