DS:1.3.4
Ambari:2.6.3.0-235
二、python测试代码 dspythontest.py:
import os
import pandas as pd
from pyspark.sql import SparkSession
from argparse import ArgumentParser
# Create (or reuse) a Spark session with Hive support enabled, so the
# script can convert pandas data to Spark DataFrames and query Hive.
# Bugfix: the original chained the builder calls across lines with no
# line-continuation characters, which is a SyntaxError in Python; wrap
# the whole chain in parentheses instead of relying on backslashes.
spark = (
    SparkSession
    .builder
    .appName("ds python test")
    .enableHiveSupport()
    .getOrCreate()
)
def main(args):
    """Build a tiny pandas DataFrame, convert it to Spark and display it.

    Args:
        args: parsed argparse Namespace; only ``args.arch`` is read
              (printed at the end as a smoke-test of argument passing).
    """
    coc = ['red', 'rellow']  # sample values kept exactly as in the original
    # Bugfix: the pandas constructor is DataFrame (capital F), not Dataframe.
    c_df = pd.DataFrame(coc, columns=['color'])
    # Derive a second column so the Spark conversion has >1 column to show.
    c_df['len'] = c_df['color'].apply(len)
    # Bugfix: the SparkSession method is createDataFrame, not createDataframe.
    # Relies on the module-level `spark` session defined at the top of the file.
    c_df = spark.createDataFrame(c_df)
    c_df.show()
    print(args.arch)
if __name__ == '__main__':
    # CLI entry point: parse options and delegate to main().
    cli = ArgumentParser()
    cli.add_argument("--arch", type=str, default='bert')
    parsed = cli.parse_args()
    main(parsed)
三、操作步骤
1、依赖环境上传到yarn
(1)将python依赖环境包pythonenv放到/home下;
(2)进入/home/pythonenv文件夹下将环境打包成zip待用:
zip -r pythonenv.zip ./*
2、将用到的python程序和配置文件传到DS资源中心
3、在DS项目管理中新建spark工作流
说明: 程序类型要选择python类型; 其他参数中前两行是将python依赖环境同步上传到yarn,#pythonenv不能省略,指的是将zip包解压到的文件夹; 其他参数的--files上传的是hive配置文件,由于ambari环境的此配置文件与程序有冲突,故修改配置重新上传此配置文件; 其他参数的最后一行末尾一定不能加续行符 \ ,否则DS会报错不存在dspythontest.py文件; pyspark程序提交yarn集群,不需要在ds的dolphinscheduler_env.sh中配置python环境也可以运行,因为python所依赖环境已经由DS上传到yarn上了;
四、遇到的问题
1、资源文件不存在问题
Exception in thread "main" java.io.FileNotFoundException: File file:/tmp/dolphinscheduler/exec/process/8/67/268/276/ dspythontest.py does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:624)
at org.apache.hadoop.fs.RawLocalFileSystem.getFilelinkStatusInternal(RawLocalFileSystem.java:850)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:614)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:422)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:340)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:292)
at org.apache.spark.deploy.yarn.Client.copyFileToRemote(Client.scala:357)
at org.apache.spark.deploy.yarn.Client.org$apache$spark$deploy$yarn$Client$$distribute$1(Client.scala:476)
at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:630)
at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:845)
at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:170)
at org.apache.spark.deploy.yarn.Client.run(Client.scala:1174)
at org.apache.spark.deploy.yarn.Client$.main(Client.scala:1233)
at org.apache.spark.deploy.yarn.Client.main(Client.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:782)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
出现此问题的原因是在其他参数的最后一行末尾多加了续行符 \ ,导致运行命令为:
${SPARK_HOME2}/bin/spark-submit --master yarn --deploy-mode cluster --driver-cores 1 --driver-memory 512M --num-executors 2 --executor-cores 2 --executor-memory 2G --queue default --archives=hdfs:///pyenv/pythonenv.zip#pythonenv
--conf spark.pyspark.python=./pythonenv/env37/bin/python3 dspythontest.py --arch 123
2、yarn日志报hive错误
Please make sure that jars for your version of hive and hadoop are included in the paths passed to spark.sql.hive.metastore.jars.
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:274)
at org.apache.spark.sql.hive.HiveUtils$.newClientFormetadata(HiveUtils.scala:362)
at org.apache.spark.sql.hive.HiveUtils$.newClientFormetadata(HiveUtils.scala:266)
at org.apache.spark.sql.hive.HiveExternalCatalog.client$lzycompute(HiveExternalCatalog.scala:66)
at org.apache.spark.sql.hive.HiveExternalCatalog.client(HiveExternalCatalog.scala:65)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply$mcZ$sp(HiveExternalCatalog.scala:194)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:194)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:194)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97)
at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:193)
at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:105)
at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:93)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.externalCatalog(HiveSessionStateBuilder.scala:39)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog$lzycompute(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog(HiveSessionStateBuilder.scala:52)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog(HiveSessionStateBuilder.scala:35)
at org.apache.spark.sql.internal.baseSessionStateBuilder.build(baseSessionStateBuilder.scala:289)
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1071)
... 16 more
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:268)
... 33 more
Caused by: java.lang.NoClassDefFoundError: org/apache/tez/dag/api/SessionNotRunning
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:529)
at org.apache.spark.sql.hive.client.HiveClientImpl.(HiveClientImpl.scala:192)
... 38 more
Caused by: java.lang.ClassNotFoundException: org.apache.tez.dag.api.SessionNotRunning
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.doLoadClass(IsolatedClientLoader.scala:225)
at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.loadClass(IsolatedClientLoader.scala:214)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 40 more
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "dspythontest.py", line 9, in
.appName("ds python test")
File "/home/hadoop/yarn/local/usercache/hdfs/appcache/application_1642127047517_0110/container_e09_1642127047517_0110_01_000001/pyspark.zip/pyspark/sql/session.py", line 179, in getOrCreate
File "/home/hadoop/yarn/local/usercache/hdfs/appcache/application_1642127047517_0110/container_e09_1642127047517_0110_01_000001/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/home/hadoop/yarn/local/usercache/hdfs/appcache/application_1642127047517_0110/container_e09_1642127047517_0110_01_000001/pyspark.zip/pyspark/sql/utils.py", line 79, in deco
pyspark.sql.utils.IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':"
此错误出现在ambari环境的hive-site.xml上,需要将hive-site.xml中的如下配置项删除,并跟随程序重新上传即可:
hive.execution.engine = tez
五、运行成功标志



