Spark standalone development
- Unpack the Java JDK, Maven, Scala, and Spark
- Configure their paths in .bash_profile
- Create a new Maven project
mvn archetype:generate -DgroupId=com.nomadli.app -DartifactId=app -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false
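The quickstart archetype's pom.xml has no Spark dependency; a minimal sketch of what to add, assuming the Spark 2.2.0 / Scala 2.11 build used in the cluster below:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.0</version>
    <scope>provided</scope>
</dependency>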
- VS Code: open -> Command Palette -> filter "task" -> Configure Task Runner -> maven, which generates tasks.json
- VS Code: Command Palette -> filter "task" -> Run Test Task
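Roughly what the generated tasks.json looks like (a sketch of the legacy 0.1.0 task format; fields may differ by VS Code version):
{
  "version": "0.1.0",
  "command": "mvn",
  "isShellCommand": true,
  "showOutput": "always",
  "suppressTaskName": true,
  "tasks": [
    { "taskName": "verify", "args": ["-B", "verify"], "isBuildCommand": true },
    { "taskName": "test",   "args": ["-B", "test"],   "isTestCommand": true }
  ]
}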
Docker Spark + Hadoop cluster setup
- Set the Docker registry mirror to https://docker.mirrors.ustc.edu.cn
- Start a CentOS container (see the sketch below)
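Both steps sketched out (one way to set the mirror is the daemon config; the container name spark-base is arbitrary):
# /etc/docker/daemon.json (restart the Docker daemon afterwards)
{ "registry-mirrors": ["https://docker.mirrors.ustc.edu.cn"] }
# start a CentOS 6 container to build the image in
docker run -it --name spark-base centos:6 /bin/bash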
- Install base services
curl http://mirrors.aliyun.com/repo/Centos-6.repo > /etc/yum.repos.d/CentOS-Base-6-aliyun.repo
mv /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak
yum makecache
yum install -y net-tools which openssh-clients openssh-server iproute.x86_64 wget
- Disable the SSH PAM authentication mechanism
sed -i 's/UsePAM yes/UsePAM no/g' /etc/ssh/sshd_config
service sshd restart
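Quick check that the change took effect:
grep UsePAM /etc/ssh/sshd_config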
- Generate an asymmetric key pair for passwordless login
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
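To confirm passwordless login works (sshd must already be running):
ssh -o StrictHostKeyChecking=no localhost 'echo ok'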
- docker cp the JDK 1.8, Hadoop 2.8, Spark 2.2.0, and Scala packages into the container
- Install the JDK, locate the install directory, and add environment variables
rpm -ivh jdk-8...-linux-x64.rpm
update-alternatives --config java
vi ~/.bashrc
export JAVA_HOME=/usr/java/jdk1.8.0_144/
export PATH=$JAVA_HOME/bin:$PATH
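Sanity check after reloading the shell config:
source ~/.bashrc
java -version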
- Unpack Scala to ~/.local, then add export SCALA_HOME=~/.local/scala-2.12.3 to ~/.bashrc
- Install Hadoop and configure ~/.bashrc
tar xvzf hadoop-2.8.0.tar.gz -C ~/.local
export HADOOP_HOME=/root/.local/hadoop-2.8.0
export HADOOP_CONFIG_HOME=$HADOOP_HOME/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
cd $HADOOP_HOME
mkdir tmp namenode datanode
vi $HADOOP_CONFIG_HOME/core-site.xml
<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/root/.local/hadoop-2.8.0/tmp</value>
    <description>temporary directory</description>
  </property>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://master:9000</value>
    <final>true</final>
    <description>master node</description>
  </property>
</configuration>
vi $HADOOP_CONFIG_HOME/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
    <final>true</final>
    <description>number of replicas</description>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/root/.local/hadoop-2.8.0/namenode</value>
    <final>true</final>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/root/.local/hadoop-2.8.0/datanode</value>
    <final>true</final>
  </property>
</configuration>
vi $HADOOP_CONFIG_HOME/mapred-site.xml
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>master:9001</value>
    <description>MapReduce job tracker</description>
  </property>
</configuration>
vi $HADOOP_CONFIG_HOME/yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
    <description>disable virtual-memory check warnings</description>
  </property>
</configuration>
~/.local/hadoop-2.8.0/bin/hadoop namenode -format
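A quick sanity check that the new PATH entries resolve:
source ~/.bashrc
hadoop version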
- Install Spark: unpack to ~/.local
vi ~/.local/spark-2.2.0-bin-without-hadoop/conf/spark-env.sh
export SCALA_HOME=/root/.local/scala-2.12.3/
export JAVA_HOME=/usr/java/jdk1.8.0_144/
export HADOOP_HOME=/root/.local/hadoop-2.8.0
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
SPARK_MASTER_IP=master
SPARK_LOCAL_DIRS=/root/.local/spark-2.2.0-bin-without-hadoop
SPARK_DRIVER_MEMORY=1G
vi ~/.local/spark-2.2.0-bin-without-hadoop/conf/slaves
# add slave01 and slave02 (an example with two worker nodes)
- Save the container as an image
docker commit -m "..." <container-name> centos:spark-hadoop
- Start three containers from the image, exposing 50070 (Hadoop), 8088 (YARN), and 8080 (Spark)
docker run -itd -P -p 50070:50070 -p 8088:8088 -p 8080:8080 --name master -h master --add-host slave01:172.17.0.4 --add-host slave02:172.17.0.5 centos:spark-hadoop
docker run -itd -P --name slave01 -h slave01 --add-host master:172.17.0.3 --add-host slave02:172.17.0.5 centos:spark-hadoop
docker run -itd -P --name slave02 -h slave02 --add-host master:172.17.0.3 --add-host slave01:172.17.0.4 centos:spark-hadoop
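The --add-host entries assume Docker hands out 172.17.0.3 to 172.17.0.5 in start order; verify the actual addresses with:
docker inspect -f '{{ .NetworkSettings.IPAddress }}' master slave01 slave02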
- Enter the master container and start the Hadoop and Spark clusters
docker exec -it master /bin/bash
cd ~/.local/hadoop-2.8.0/sbin
sh start-all.sh
cd ~/.local/spark-2.2.0-bin-without-hadoop/sbin
sh start-all.sh
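A smoke test once both clusters are up, run on master (the jar path follows the standard Spark 2.2.0 layout; adjust if the build differs):
jps    # expect NameNode, SecondaryNameNode, ResourceManager, and Master on this node
cd ~/.local/spark-2.2.0-bin-without-hadoop
bin/spark-submit --class org.apache.spark.examples.SparkPi --master spark://master:7077 examples/jars/spark-examples_2.11-2.2.0.jar 10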