Spark

Posted by nomadli on September 22, 2017

Spark standalone development

  1. Unpack the Java JDK, Maven, Scala, and Spark
  2. Configure the paths in .bash_profile
  3. Create a new project with Maven (a pom.xml sketch for the Spark dependency follows this list)

    mvn archetype:generate \
    -DgroupId=com.nomadli.app \
    -DartifactId=app \
    -DarchetypeArtifactId=maven-archetype-quickstart \
    -DinteractiveMode=false
    
  4. In VS Code: open the Command Palette -> filter on "task" -> Configure Task Runner -> maven, which generates tasks.json
  5. In VS Code: Command Palette -> filter on "task" -> Run Test Task
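
     To actually build Spark code in the generated project, the Spark dependency has to be added to app/pom.xml; a minimal sketch, assuming the Spark 2.2.0 artifact for Scala 2.11 from Maven Central:

    vi app/pom.xml
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.2.0</version>
    </dependency>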

Docker Spark + Hadoop cluster setup

  1. Point Docker at a registry mirror, e.g. https://docker.mirrors.ustc.edu.cn
  2. Start a CentOS container (a sketch of these two steps follows)
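
     A sketch of these two steps on the Docker host; /etc/docker/daemon.json and the centos:6 tag are assumptions (the Aliyun repo file in step 3 targets CentOS 6):

     vi /etc/docker/daemon.json
     { "registry-mirrors": ["https://docker.mirrors.ustc.edu.cn"] }

     # restart the Docker daemon so the mirror takes effect, then start a container;
     # "base" is just a placeholder name for the build container
     docker pull centos:6
     docker run -it --name base centos:6 /bin/bash
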
  3. Install the base services

     curl http://mirrors.aliyun.com/repo/Centos-6.repo > /etc/yum.repos.d/CentOS-Base-6-aliyun.repo
     mv /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak
     yum makecache
     yum install -y net-tools which openssh-clients openssh-server iproute.x86_64 wget
    
  4. Disable PAM authentication for SSH

     sed -i 's/UsePAM yes/UsePAM no/g' /etc/ssh/sshd_config
     service sshd restart
    
  5. Generate an asymmetric key pair

     ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
     cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
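
     A quick sanity check that key-based login into the container itself works (sshd was restarted in step 4):

     chmod 700 ~/.ssh && chmod 600 ~/.ssh/authorized_keys
     ssh -o StrictHostKeyChecking=no localhost hostname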
    
  6. docker cp the JDK 1.8, Hadoop 2.8, Spark 2.2.0, and Scala packages into the container (see the sketch below)
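
     For example (the archive names are placeholders for whatever was actually downloaded, and "base" is the container from step 2):

     docker cp jdk-8u144-linux-x64.rpm base:/root/
     docker cp hadoop-2.8.0.tar.gz base:/root/
     docker cp spark-2.2.0-bin-without-hadoop.tgz base:/root/
     docker cp scala-2.12.3.tgz base:/root/
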
  7. Install the JDK, locate the install directory, and add it to the environment

     rpm -ivh jdk-8...-linux-x64.rpm
     update-alternatives --config java
     vi ~/.bashrc
     export JAVA_HOME=/usr/java/jdk1.8.0_144/
     export PATH=$JAVA_HOME/bin:$PATH
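
     After reloading the shell the JDK can be sanity-checked:

     source ~/.bashrc
     java -version    # should report 1.8.0_144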
    
  8. Unpack Scala into ~/.local and export SCALA_HOME=~/.local/scala-2.12.3 (sketch below)
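
     For example (assuming the Scala tarball copied in step 6):

     tar xvzf scala-2.12.3.tgz -C ~/.local
     echo 'export SCALA_HOME=~/.local/scala-2.12.3' >> ~/.bashrc
     echo 'export PATH=$SCALA_HOME/bin:$PATH' >> ~/.bashrc
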
  9. Install Hadoop and set it up in ~/.bashrc

     tar xvzf hadoop-2.8.0.tar.gz -C ~/.local
     export HADOOP_HOME=/root/.local/hadoop-2.8.0
     export HADOOP_CONFIG_HOME=$HADOOP_HOME/etc/hadoop
     export PATH=$PATH:$HADOOP_HOME/bin
     export PATH=$PATH:$HADOOP_HOME/sbin
     export SPARK_DIST_CLASSPATH=$(hadoop classpath)
     cd $HADOOP_HOME
     mkdir tmp namenode datanode
    	
     vi $HADOOP_CONFIG_HOME/core-site.xml
     <configuration>
         <property>
             <name>hadoop.tmp.dir</name>
             <value>/root/.local/hadoop-2.8.0/tmp</value>
             <description>temporary directory</description>
         </property>

         <property>
             <name>fs.default.name</name>
             <value>hdfs://master:9000</value>
             <final>true</final>
             <description>master node (default filesystem URI)</description>
         </property>
     </configuration>
    	
     vi $HADOOP_CONFIG_HOME/hdfs-site.xml
     <configuration>
         <property>
             <name>dfs.replication</name>
             <value>2</value>
             <final>true</final>
             <description>number of replicas (one per worker node)</description>
         </property>

         <property>
             <name>dfs.namenode.name.dir</name>
             <value>/root/.local/hadoop-2.8.0/namenode</value>
             <final>true</final>
         </property>

         <property>
             <name>dfs.datanode.data.dir</name>
             <value>/root/.local/hadoop-2.8.0/datanode</value>
             <final>true</final>
         </property>
     </configuration>
    	
     vi $HADOOP_CONFIG_HOME/mapred-site.xml
     <configuration>
         <property>
             <name>mapred.job.tracker</name>
             <value>master:9001</value>
             <description>MapReduce job tracker address</description>
         </property>
     </configuration>
    	
     vi $HADOOP_CONFIG_HOME/yarn-site.xml
     <configuration>
         <property>
             <name>yarn.nodemanager.vmem-check-enabled</name>
             <value>false</value>
             <description>disable the virtual-memory check warning</description>
         </property>
     </configuration>
    	
     ~/.local/hadoop-2.8.0/bin/hadoop namenode -format
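
     start-dfs.sh and start-yarn.sh also look up the worker hosts in Hadoop's own slaves file, so list them there as well (assuming the slave01/slave02 hostnames used in step 12):

     vi $HADOOP_CONFIG_HOME/slaves
     slave01
     slave02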
    
  10. Install Spark: unpack it into ~/.local

    vi ~/.local/spark-2.2.0-bin-without-hadoop/conf/spark-env.sh
    export SCALA_HOME=/root/.local/scala-2.12.3/
    export JAVA_HOME=/usr/java/jdk1.8.0_144/
    export HADOOP_HOME=/root/.local/hadoop-2.8.0
    export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
    SPARK_MASTER_IP=master
    SPARK_LOCAL_DIRS=/root/.local/spark-2.2.0-bin-without-hadoop
    SPARK_DRIVER_MEMORY=1G
    	
    vi conf/slaves and add slave01 slave02    # example with two worker nodes
    
  11. Save the image

    docker commit -m "..." name centos:spark-hadoop
    
  12. Start three containers from the image, exposing 50070 (Hadoop), 8088 (YARN), and 8080 (Spark)

    docker run -itd -P -p 50070:50070 -p 8088:8088 -p 8080:8080 --name master -h master --add-host slave01:172.17.0.4 --add-host slave02:172.17.0.5 centos:spark-hadoop
    docker run -itd -P --name slave01 -h slave01 --add-host master:172.17.0.3 --add-host slave02:172.17.0.5 centos:spark-hadoop
    docker run -itd -P --name slave02 -h slave02 --add-host master:172.17.0.3 --add-host slave01:172.17.0.4 centos:spark-hadoop
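
    The --add-host entries assume Docker assigns 172.17.0.3/0.4/0.5 in start order; the actual addresses can be checked (and the entries adjusted) with:

    docker inspect -f '{{.NetworkSettings.IPAddress}}' master slave01 slave02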
    
  13. Enter the master container and start the Hadoop and Spark clusters

    docker exec -it master /bin/bash
    cd ~/.local/hadoop-2.8.0/sbin
    ./start-all.sh
    cd ~/.local/spark-2.2.0-bin-without-hadoop/sbin
    ./start-all.sh
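
    If everything came up, jps on master should show NameNode, SecondaryNameNode, ResourceManager and Master, and jps on the slaves DataNode, NodeManager and Worker. A quick end-to-end check with the bundled SparkPi example (jar path, class name and port 7077 are the Spark 2.2.0 defaults):

    jps
    hdfs dfsadmin -report
    ~/.local/spark-2.2.0-bin-without-hadoop/bin/spark-submit \
        --master spark://master:7077 \
        --class org.apache.spark.examples.SparkPi \
        ~/.local/spark-2.2.0-bin-without-hadoop/examples/jars/spark-examples_2.11-2.2.0.jar 10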