步骤1:使用Python连接Spark环境
步骤2:创建DataFrame数据
# Step 1: connect to a Spark environment from Python.
import pandas as pd
from pyspark.sql import SparkSession

# BUGFIX: the original split the builder chain across bare lines with no
# parentheses or backslash continuations, which is a SyntaxError in Python.
# Wrapping the chain in parentheses makes it one expression.
spark = (
    SparkSession
    .builder
    .appName('pyspark')
    .getOrCreate()
)

# Step 2: create the raw DataFrame.
# BUGFIX: the method is `createDataFrame`, not `createDataframe` —
# Python attribute names are case-sensitive, so the original raised
# AttributeError. Columns: student number, class id, then five subject scores.
test = spark.createDataFrame(
    [('001', '1', 100, 87, 67, 83, 98),
     ('002', '2', 87, 81, 90, 83, 83),
     ('003', '3', 86, 91, 83, 89, 63),
     ('004', '2', 65, 87, 94, 73, 88),
     ('005', '1', 76, 62, 89, 81, 98),
     ('006', '3', 84, 82, 85, 73, 99),
     ('007', '3', 56, 76, 63, 72, 87),
     ('008', '1', 55, 62, 46, 78, 71),
     ('009', '2', 63, 72, 87, 98, 64)],
    ['number', 'class', 'language', 'math', 'english', 'physic', 'chemical'])
test.show()
/usr/local/lib/python3.6/site-packages/pyspark/context.py:238: FutureWarning: Python 3.6 support is deprecated in Spark 3.2. FutureWarning +------+-----+--------+----+-------+------+--------+ |number|class|language|math|english|physic|chemical| +------+-----+--------+----+-------+------+--------+ | 001| 1| 100| 87| 67| 83| 98| | 002| 2| 87| 81| 90| 83| 83| | 003| 3| 86| 91| 83| 89| 63| | 004| 2| 65| 87| 94| 73| 88| | 005| 1| 76| 62| 89| 81| 98| | 006| 3| 84| 82| 85| 73| 99| | 007| 3| 56| 76| 63| 72| 87| | 008| 1| 55| 62| 46| 78| 71| | 009| 2| 63| 72| 87| 98| 64| +------+-----+--------+----+-------+------+--------+
# Step 3: with Spark, find the number of rows and columns in the data.
n_cols = len(test.columns)   # column names are available locally, no job needed
n_rows = test.count()        # triggers a Spark action to count the rows
print(f'There are {n_cols} columns and {n_rows} rows in this dataframe.')
There are 7 columns and 9 rows in this dataframe.
# Step 4: with Spark, select the samples whose class is 1.
# `filter` is an exact alias of `where`; the SQL predicate is unchanged.
test.filter("class == 1").collect()
[Row(number='001', class='1', language=100, math=87, english=67, physic=83, chemical=98), Row(number='005', class='1', language=76, math=62, english=89, physic=81, chemical=98), Row(number='008', class='1', language=55, math=62, english=46, physic=78, chemical=71)]
# Step 5: with Spark, select the samples with language > 90 or math > 90.
# `filter` is an exact alias of `where`; the SQL predicate is unchanged.
test.filter("language > 90 or math > 90").collect()
[Row(number='001', class='1', language=100, math=87, english=67, physic=83, chemical=98), Row(number='003', class='3', language=86, math=91, english=83, physic=89, chemical=63)]



