Table of Contents
- User Logs
- Uses
- Log Generation Channels
- Log Contents
- Significance
- Offline Data Processing Architecture
- Analyzing the Logs
- Adding the UserAgent Parser
- Standalone Implementation
- Hadoop MapReduce Implementation
- Results
- Summary
User Logs
Uses
Analyzing user behavior and powering recommendations.
Log Generation Channels
Server side: Nginx access logs. Front end: Ajax-based tracking.
Log Contents
The fields are itemized in the Summary at the end of this post.
Significance
Measure the purchase conversion rate of each promotion, and adjust regional marketing spend in time.
Judge from conversion rates whether a page's sub-pages and navigation paths are laid out sensibly.
Analyze the logs to allocate the marketing budget where it pays off.
Offline Data Processing Architecture
Data processing flow (a command-level sketch follows this list):
1) Data collection
Flume: write the web logs into HDFS
2) Data cleansing
filter out dirty data
Spark, Hive, MapReduce, or another distributed computing framework
the cleaned data can be stored back in HDFS (Hive/Spark SQL)
3) Data processing
run the statistics and analysis the business needs
Spark, Hive, MapReduce, or another distributed computing framework
4) Storing the results
results can go into an RDBMS or NoSQL store
5) Data visualization
present the results graphically: pie, bar, map, and line charts
ECharts, HUE, Zeppelin
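To tie the five steps to concrete commands, here is a minimal sketch under this post's setup; it is only an outline (the Flume agent configuration is not shown, and the jar and paths are the ones built and used later in this post):

# 1) Collection: a Flume agent writes the web logs into HDFS (config not shown)
# 2) + 3) Cleaning and processing: the LogApp MapReduce job developed below
hadoop jar hdfs-api-1.0-SNAPSHOT-jar-with-dependencies.jar \
  com.bennyrhys.hadoop.project.LogApp /10000_access.log /browserout
# 4) Result storage: export the output files into an RDBMS/NoSQL store
# 5) Visualization: chart the stored counts with ECharts/HUE/Zeppelin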
Analyzing the Logs
Information we can extract: location, time, device, and visit counts.
Running the UserAgent job:
hadoop jar /home/hadoop/lib/hadoop-train-1.0-jar-with-dependencies.jar com.imooc.hadoop.project.LogApp /10000_access.log /browserout
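To inspect the result, list the output directory and print the reducer output; a quick check, assuming the job ran with a single reducer (the default), whose output file is named part-r-00000:

hadoop fs -ls /browserout
hadoop fs -cat /browserout/part-r-00000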
Adding the UserAgent Parser

# clone the parser
git clone https://github.com/LeeKemp/UserAgentParser.git
# build it
mvn clean package -DskipTests
# the built jar is under target/
cd target/
UserAgentParser-0.0.1.jar
# cd .. back to the project root, then install the jar into the local Maven repository so IDEA can resolve it
mvn clean install -DskipTests
# verify that the local jar was installed
(base) bennyrhysdeMacBook-Pro:UserAgentParser bennyrhys$ cd /Users/bennyrhys/.m2/repository/com/kumkee/UserAgentParser/0.0.1/
(base) bennyrhysdeMacBook-Pro:0.0.1 bennyrhys$ ls
UserAgentParser-0.0.1.jar UserAgentParser-0.0.1.pom _remote.repositories
Add the dependency to pom.xml (the full pom.xml appears later in this post).
To start, process just 100 of the 10,000 log lines:

# extract 100 lines
head -n 100 10000_access.log > 100_access.log
# verify the line count
wc -l 100_access.log
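Before writing any Java, it helps to eyeball which quote-delimited field holds the UserAgent. A quick sanity check, assuming every line follows the quoting convention of the sample line in testGetCharacterPosition below (the UserAgent starts after the 7th double quote, i.e. it is field 8 when splitting on "):

# print the UserAgent field of the first few lines
awk -F'"' '{print $8}' 100_access.log | head -3
# rough browser-string distribution, useful for cross-checking the MapReduce result later
awk -F'"' '{print $8}' 100_access.log | sort | uniq -c | sort -rn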
Standalone Implementation
UserAgentTest
package com.bennyrhys.hadoop.project;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;
import org.apache.commons.lang.StringUtils;
import org.junit.Test;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @Author bennyrhys
 * @Date 1/16/21 6:37 PM
 * UserAgent test class
 */
public class UserAgentTest {

    /**
     * Read the file and count browser occurrences from the UserAgent field
     * @throws IOException
     */
    @Test
    public void testReadFile() throws IOException {
        String path = "/Users/bennyrhys/Desktop/spark-practice/100_access.log";
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(path)))
        );

        String line;
        HashMap<String, Integer> browserMap = new HashMap<>();
        UserAgentParser userAgentParser = new UserAgentParser();
        while ((line = reader.readLine()) != null) { // read one line at a time
            if (StringUtils.isNotBlank(line)) {
                // everything after the 7th double quote is the UserAgent field
                String source = line.substring(getCharacterPosition(line, "\"", 7) + 1);
                UserAgent agent = userAgentParser.parse(source);
                String browser = agent.getBrowser();
                String engine = agent.getEngine();
                String engineVersion = agent.getEngineVersion();
                String os = agent.getOs();
                String platform = agent.getPlatform();
                String version = agent.getVersion();
                boolean mobile = agent.isMobile();
                System.out.println(browser + " " + engine + " " + engineVersion + " " + os + " " + platform + " " + version + " " + mobile);

                Integer browserCount = browserMap.get(browser);
                if (browserCount != null) {
                    browserMap.put(browser, browserCount + 1);
                } else {
                    browserMap.put(browser, 1);
                }
            }
        }
        for (Map.Entry<String, Integer> entry : browserMap.entrySet()) {
            System.out.println(entry.getKey() + " : " + entry.getValue());
        }
    }

    /**
     * Test locating the delimiter
     */
    @Test
    public void testGetCharacterPosition() {
        String value = "183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] \"POST /api3/getadv HTTP/1.1\" 200 813 \"www.imooc.com\" \"-\" cid=0&timestamp=1478707261865&uid=2871142&marking=androidbanner&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=f51e97d1cb1a9caac669ea8acc162b96 \"mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G\" \"-\" 10.100.134.244:80 200 0.027 0.027";
        int index = getCharacterPosition(value, "\"", 7);
        System.out.println(index);
    }

    /**
     * Return the index of the n-th occurrence of the given marker string
     */
    private int getCharacterPosition(String value, String operator, int index) {
        Matcher matcher = Pattern.compile(operator).matcher(value);
        int mIdx = 0;
        while (matcher.find()) {
            mIdx++;
            if (mIdx == index) {
                break;
            }
        }
        return matcher.start();
    }

    /**
     * Unit test: using the UserAgentParser utility
     */
    @Test
    public void UserAgentParserT1() {
        // String source = "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G";
        String source = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
        UserAgentParser userAgentParser = new UserAgentParser();
        UserAgent agent = userAgentParser.parse(source);
        String browser = agent.getBrowser();
        String engine = agent.getEngine();
        String engineVersion = agent.getEngineVersion();
        String os = agent.getOs();
        String platform = agent.getPlatform();
        String version = agent.getVersion();
        boolean mobile = agent.isMobile();
        System.out.println(browser + " " + engine + " " + engineVersion + " " + os + " " + platform + " " + version + " " + mobile);
    }
}
Hadoop MapReduce Implementation
LogApp.java
package com.bennyrhys.hadoop.project;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @Author bennyrhys
 * @Date 1/18/21 11:21 AM
 * Use MapReduce to count browser visits from the access logs
 */
public class LogApp {

    /**
     * Map: read the input file
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        LongWritable one = new LongWritable(1);
        private UserAgentParser userAgentParser;

        /**
         * setup runs only once per task: initialize the parser here
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            userAgentParser = new UserAgentParser();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each incoming value is one line of the log
            String line = value.toString();
            // everything after the 7th double quote is the UserAgent field
            String source = line.substring(getCharacterPosition(line, "\"", 7) + 1);
            UserAgent agent = userAgentParser.parse(source);
            String browser = agent.getBrowser();
            // emit the map output (browser, 1) through the context
            context.write(new Text(browser), one);
        }

        /**
         * cleanup runs once at task end: release the parser for GC
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            userAgentParser = null;
        }
    }

    /**
     * Return the index of the n-th occurrence of the given marker string
     */
    private static int getCharacterPosition(String value, String operator, int index) {
        Matcher matcher = Pattern.compile(operator).matcher(value);
        int mIdx = 0;
        while (matcher.find()) {
            mIdx++;
            if (mIdx == index) {
                break;
            }
        }
        return matcher.start();
    }

    /**
     * Reduce: merge the counts
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                // sum up the occurrences of this key
                sum += value.get();
            }
            // write the final count
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Driver: wires together all the information of the MapReduce job
     */
    public static void main(String[] args) throws Exception {
        // create the Configuration
        Configuration configuration = new Configuration();

        // delete the output directory if it already exists
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("output path exists, deleted it first");
        }

        // create the Job
        Job job = Job.getInstance(configuration, "LogApp");

        // set the job's main class
        job.setJarByClass(LogApp.class);

        // input path
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // map settings
        job.setMapperClass(LogApp.MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // reduce settings
        job.setReducerClass(LogApp.MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
pom.xml — note the assembly plugin, which builds a jar-with-dependencies that bundles the locally installed UserAgentParser jar:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.bennyrhys.hadoop</groupId>
  <artifactId>hdfs-api</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>hdfs-api</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
  </properties>

  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
    </dependency>
    <!-- the locally installed jar: the UserAgentParser dependency -->
    <dependency>
      <groupId>com.kumkee</groupId>
      <artifactId>UserAgentParser</artifactId>
      <version>0.0.1</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <!-- package with: mvn assembly:assembly -->
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass></mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
代码语言:javascript复制-rw-r--r-- 1 bennyrhys staff 34246 Jan 18 19:36 hdfs-api-1.0-SNAPSHOT-jar-with-dependencies.jar
-rw-r--r-- 1 bennyrhys staff 24436 Jan 18 19:36 hdfs-api-1.0-SNAPSHOT.jar
On the Hadoop side, start HDFS and YARN, upload the dependency jar into lib/ and the log file into HDFS, and wrap the run in a shell script, log.sh:

hadoop fs -rm -r /browserout
hadoop jar /root/lib/hdfs-api-1.0-SNAPSHOT-jar-with-dependencies.jar com.bennyrhys.hadoop.project.LogApp hdfs://hadoop01:9000/10000_access.log hdfs://hadoop01:9000/browserout
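A minimal log.sh assembling the commands above, plus a check of the output; the file name and the chmod step are my assumption about how the script is used:

#!/bin/bash
# log.sh -- drop any previous output, rerun the job, show the result
hadoop fs -rm -r /browserout
hadoop jar /root/lib/hdfs-api-1.0-SNAPSHOT-jar-with-dependencies.jar \
  com.bennyrhys.hadoop.project.LogApp \
  hdfs://hadoop01:9000/10000_access.log hdfs://hadoop01:9000/browserout
hadoop fs -cat /browserout/part-r-00000

Run it with chmod +x log.sh && ./log.sh.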
Results
Summary

User behavior logs: all the behavior data generated each time a user visits the site (visits, browsing, searches, clicks, ...); also called user behavior trails or traffic logs.
Log data contents:
1) System attributes of the visit: operating system, browser, etc.
2) Visit characteristics: the URL clicked, the URL it came from (referer), time spent on the page, etc.
3) Visit information: session_id, visitor IP (and thus city), etc.
2013-05-19 13:00:00 http://www.taobao.com/17/?tracker_u=1624169&type=1 B58W48U4WKZCJ5D1T3Z9ZY88RU7QA7B1 http://hao.360.cn/ 1.196.34.243
Data processing flow:
1) Data collection
Flume: write the web logs into HDFS
2) Data cleansing
filter out dirty data
Spark, Hive, MapReduce, or another distributed computing framework
the cleaned data can be stored back in HDFS (Hive/Spark SQL)
3) Data processing
run the statistics and analysis the business needs
Spark, Hive, MapReduce, or another distributed computing framework
4) Storing the results
results can go into an RDBMS or NoSQL store
5) Data visualization
present the results graphically: pie, bar, map, and line charts
ECharts, HUE, Zeppelin
Running the UserAgent job:
hadoop jar /home/hadoop/lib/hadoop-train-1.0-jar-with-dependencies.jar com.imooc.hadoop.project.LogApp /10000_access.log /browserout