Skip to main content

Progress Checklist for PySpark SQL

Core Classes

Functions | Supported ✅?
SparkSession(sparkContext[, jsparkSession])
DataFrame(jdf, sql_ctx)
Column(jc)
Row
GroupedData(jgd, df)⬜️
PandasCogroupedOps(gd1, gd2)⬜️
DataFrameNaFunctions(df)⬜️
DataFrameStatFunctions(df)⬜️
Window⬜️

Spark Session APIs

Functions | Supported ✅?
SparkSession.builder.config([key, value, conf])
SparkSession.builder.appName(name)
SparkSession.builder.enableHiveSupport()⬜️
SparkSession.builder.getOrCreate()
SparkSession.builder.master(master)⬜️
SparkSession.catalog⬜️
SparkSession.conf
SparkSession.createDataFrame(data[, schema, …])⬜️
SparkSession.getActiveSession()⬜️
SparkSession.newSession()⬜️
SparkSession.range(start[, end, step, …])⬜️
SparkSession.read
SparkSession.readStream⬜️
SparkSession.sparkContext
SparkSession.sql(sqlQuery)
SparkSession.stop()⬜️
SparkSession.streams⬜️
SparkSession.table(tableName)⬜️
SparkSession.udf⬜️
SparkSession.version

Configuration

Functions | Supported ✅?
RuntimeConfig(jconf)⬜️

Input and Output

Functions | Supported ✅?
DataFrameReader.csv(path[, schema, sep, …])
DataFrameReader.format(source)
DataFrameReader.jdbc(url, table[, column, …])⬜️
DataFrameReader.json(path[, schema, …])⬜️
DataFrameReader.load([path, format, schema])⬜️
DataFrameReader.option(key, value)⬜️
DataFrameReader.options(**options)⬜️
DataFrameReader.orc(path[, mergeSchema, …])⬜️
DataFrameReader.parquet(*paths, **options)
DataFrameReader.schema(schema)
DataFrameReader.table(tableName)⬜️
DataFrameWriter.bucketBy(numBuckets, col, *cols)⬜️
DataFrameWriter.csv(path[, mode, …])
DataFrameWriter.format(source)⬜️
DataFrameWriter.insertInto(tableName[, …])⬜️
DataFrameWriter.jdbc(url, table[, mode, …])⬜️
DataFrameWriter.json(path[, mode, …])⬜️
DataFrameWriter.mode(saveMode)⬜️
DataFrameWriter.option(key, value)⬜️
DataFrameWriter.options(**options)⬜️
DataFrameWriter.orc(path[, mode, …])⬜️
DataFrameWriter.parquet(path[, mode, …])
DataFrameWriter.partitionBy(*cols)⬜️
DataFrameWriter.save([path, format, mode, …])
DataFrameWriter.saveAsTable(name[, format, …])⬜️
DataFrameWriter.sortBy(col, *cols)
DataFrameWriter.text(path[, compression, …])⬜️

DataFrame APIs

Functions | Supported ✅?
DataFrame.agg(*exprs)
DataFrame.alias(alias)⬜️
DataFrame.approxQuantile(col, probabilities, …)⬜️
DataFrame.cache()
DataFrame.checkpoint([eager])⬜️
DataFrame.coalesce(numPartitions)
DataFrame.colRegex(colName)⬜️
DataFrame.collect()
DataFrame.columns
DataFrame.corr(col1, col2[, method])⬜️
DataFrame.count()
DataFrame.cov(col1, col2)⬜️
DataFrame.createGlobalTempView(name)⬜️
DataFrame.createOrReplaceGlobalTempView(name)
DataFrame.createOrReplaceTempView(name)
DataFrame.createTempView(name)
DataFrame.crossJoin(other)⬜️
DataFrame.crosstab(col1, col2)⬜️
DataFrame.cube(*cols)⬜️
DataFrame.describe(*cols)⬜️
DataFrame.distinct()⬜️
DataFrame.drop(*cols)⬜️
DataFrame.dropDuplicates([subset])⬜️
DataFrame.drop_duplicates([subset])⬜️
DataFrame.dropna([how, thresh, subset])⬜️
DataFrame.dtypes⬜️
DataFrame.exceptAll(other)⬜️
DataFrame.explain([extended, mode])⬜️
DataFrame.fillna(value[, subset])⬜️
DataFrame.filter(condition)⬜️
DataFrame.first()⬜️
DataFrame.foreach(f)⬜️
DataFrame.foreachPartition(f)⬜️
DataFrame.freqItems(cols[, support])⬜️
DataFrame.groupBy(*cols)
DataFrame.head([n])⬜️
DataFrame.hint(name, *parameters)⬜️
DataFrame.inputFiles()⬜️
DataFrame.intersect(other)⬜️
DataFrame.intersectAll(other)⬜️
DataFrame.isLocal()⬜️
DataFrame.isStreaming⬜️
DataFrame.join(other[, on, how])
DataFrame.limit(num)⬜️
DataFrame.localCheckpoint([eager])⬜️
DataFrame.mapInPandas(func, schema)⬜️
DataFrame.na⬜️
DataFrame.orderBy(*cols, **kwargs)
DataFrame.persist([storageLevel])
DataFrame.printSchema()
DataFrame.randomSplit(weights[, seed])⬜️
DataFrame.rdd⬜️
DataFrame.registerTempTable(name)
DataFrame.repartition(numPartitions, *cols)⬜️
DataFrame.repartitionByRange(numPartitions, …)⬜️
DataFrame.replace(to_replace[, value, subset])⬜️
DataFrame.rollup(*cols)⬜️
DataFrame.sameSemantics(other)⬜️
DataFrame.sample([withReplacement, …])⬜️
DataFrame.sampleBy(col, fractions[, seed])⬜️
DataFrame.schema
DataFrame.select(*cols)
DataFrame.selectExpr(*expr)⬜️
DataFrame.semanticHash()⬜️
DataFrame.show([n, truncate, vertical])⬜️
DataFrame.sort(*cols, **kwargs)
DataFrame.sortWithinPartitions(*cols, **kwargs)⬜️
DataFrame.stat⬜️
DataFrame.storageLevel
DataFrame.subtract(other)⬜️
DataFrame.summary(*statistics)⬜️
DataFrame.tail(num)⬜️
DataFrame.take(num)⬜️
DataFrame.toDF(*cols)⬜️
DataFrame.toJSON([use_unicode])⬜️
DataFrame.toLocalIterator([prefetchPartitions])⬜️
DataFrame.toPandas()⬜️
DataFrame.transform(func)⬜️
DataFrame.union(other)⬜️
DataFrame.unionAll(other)⬜️
DataFrame.unionByName(other[, …])⬜️
DataFrame.unpersist([blocking])
DataFrame.where(condition)
DataFrame.withColumn(colName, col)⬜️
DataFrame.withColumnRenamed(existing, new)⬜️
DataFrame.withWatermark(eventTime, …)⬜️
DataFrame.write
DataFrame.writeStream⬜️
DataFrame.writeTo(table)⬜️
DataFrameNaFunctions.drop([how, thresh, subset])⬜️
DataFrameNaFunctions.fill(value[, subset])⬜️
DataFrameNaFunctions.replace(to_replace[, …])⬜️
DataFrameStatFunctions.approxQuantile(col, …)⬜️
DataFrameStatFunctions.corr(col1, col2[, method])
DataFrameStatFunctions.cov(col1, col2)⬜️
DataFrameStatFunctions.crosstab(col1, col2)⬜️
DataFrameStatFunctions.freqItems(cols[, support])⬜️
DataFrameStatFunctions.sampleBy(col, fractions)⬜️

Column APIs

Functions | Supported ✅?
Column.alias(*alias, **kwargs)⬜️
Column.asc()⬜️
Column.asc_nulls_first()⬜️
Column.asc_nulls_last()⬜️
Column.astype(dataType)⬜️
Column.between(lowerBound, upperBound)⬜️
Column.bitwiseAND(other)⬜️
Column.bitwiseOR(other)⬜️
Column.bitwiseXOR(other)⬜️
Column.cast(dataType)⬜️
Column.contains(other)⬜️
Column.desc()⬜️
Column.desc_nulls_first()⬜️
Column.desc_nulls_last()⬜️
Column.dropFields(*fieldNames)⬜️
Column.endswith(other)⬜️
Column.eqNullSafe(other)⬜️
Column.getField(name)⬜️
Column.getItem(key)⬜️
Column.isNotNull()⬜️
Column.isNull()⬜️
Column.isin(*cols)⬜️
Column.like(other)
Column.name(*alias, **kwargs)⬜️
Column.otherwise(value)⬜️
Column.over(window)⬜️
Column.rlike(other)⬜️
Column.startswith(other)⬜️
Column.substr(startPos, length)
Column.when(condition, value)⬜️
Column.withField(fieldName, col)⬜️

Data Types

Functions | Supported ✅?
ArrayType(elementType[, containsNull])⬜️
BinaryType⬜️
BooleanType⬜️
ByteType⬜️
DataType⬜️
DateType
DecimalType([precision, scale])⬜️
DoubleType
FloatType
IntegerType
LongType
MapType(keyType, valueType[, valueContainsNull])⬜️
NullType⬜️
ShortType
StringType⬜️
StructField(name, dataType[, nullable, metadata])⬜️
StructType([fields])⬜️
TimestampType⬜️

Row

Functions | Supported ✅?
Row.asDict([recursive])⬜️

Functions

Functions | Supported ✅?
abs(col)⬜️
acos(col)⬜️
acosh(col)⬜️
add_months(start, months)⬜️
aggregate(col, initialValue, merge[, finish])
approxCountDistinct(col[, rsd])⬜️
approx_count_distinct(col[, rsd])⬜️
array(*cols)⬜️
array_contains(col, value)⬜️
array_distinct(col)⬜️
array_except(col1, col2)⬜️
array_intersect(col1, col2)⬜️
array_join(col, delimiter[, null_replacement])⬜️
array_max(col)⬜️
array_min(col)⬜️
array_position(col, value)⬜️
array_remove(col, element)⬜️
array_repeat(col, count)⬜️
array_sort(col)⬜️
array_union(col1, col2)⬜️
arrays_overlap(a1, a2)⬜️
arrays_zip(*cols)⬜️
asc(col)⬜️
asc_nulls_first(col)⬜️
asc_nulls_last(col)⬜️
ascii(col)⬜️
asin(col)⬜️
asinh(col)⬜️
assert_true(col[, errMsg])⬜️
atan(col)⬜️
atanh(col)⬜️
atan2(col1, col2)⬜️
avg(col)
base64(col)⬜️
bin(col)⬜️
bitwiseNOT(col)⬜️
broadcast(df)⬜️
bround(col[, scale])⬜️
bucket(numBuckets, col)⬜️
cbrt(col)⬜️
ceil(col)⬜️
coalesce(*cols)⬜️
col(col)⬜️
collect_list(col)⬜️
collect_set(col)⬜️
column(col)
concat(*cols)⬜️
concat_ws(sep, *cols)⬜️
conv(col, fromBase, toBase)⬜️
corr(col1, col2)
cos(col)⬜️
cosh(col)⬜️
count(col)
countDistinct(col, *cols)⬜️
covar_pop(col1, col2)⬜️
covar_samp(col1, col2)⬜️
crc32(col)⬜️
create_map(*cols)⬜️
cume_dist()⬜️
current_date()⬜️
current_timestamp()⬜️
date_add(start, days)⬜️
date_format(date, format)⬜️
date_sub(start, days)⬜️
date_trunc(format, timestamp)⬜️
datediff(end, start)⬜️
dayofmonth(col)⬜️
dayofweek(col)⬜️
dayofyear(col)⬜️
days(col)⬜️
decode(col, charset)⬜️
degrees(col)⬜️
dense_rank()⬜️
desc(col)⬜️
desc_nulls_first(col)⬜️
desc_nulls_last(col)⬜️
element_at(col, extraction)⬜️
encode(col, charset)⬜️
exists(col, f)⬜️
exp(col)⬜️
explode(col)⬜️
explode_outer(col)⬜️
expm1(col)⬜️
expr(str)
factorial(col)⬜️
filter(col, f)⬜️
first(col[, ignorenulls])⬜️
flatten(col)⬜️
floor(col)⬜️
forall(col, f)⬜️
format_number(col, d)⬜️
format_string(format, *cols)⬜️
from_csv(col, schema[, options])⬜️
from_json(col, schema[, options])⬜️
from_unixtime(timestamp[, format])⬜️
from_utc_timestamp(timestamp, tz)⬜️
get_json_object(col, path)⬜️
greatest(*cols)⬜️
grouping(col)⬜️
grouping_id(*cols)⬜️
hash(*cols)⬜️
hex(col)⬜️
hour(col)⬜️
hours(col)⬜️
hypot(col1, col2)⬜️
initcap(col)⬜️
input_file_name()⬜️
instr(str, substr)⬜️
isnan(col)⬜️
isnull(col)⬜️
json_tuple(col, *fields)⬜️
kurtosis(col)⬜️
lag(col[, offset, default])⬜️
last(col[, ignorenulls])⬜️
last_day(date)⬜️
lead(col[, offset, default])⬜️
least(*cols)⬜️
length(col)⬜️
levenshtein(left, right)⬜️
lit(col)⬜️
locate(substr, str[, pos])⬜️
log(arg1[, arg2])⬜️
log10(col)⬜️
log1p(col)⬜️
log2(col)⬜️
lower(col)⬜️
lpad(col, len, pad)⬜️
ltrim(col)⬜️
map_concat(*cols)⬜️
map_entries(col)⬜️
map_filter(col, f)⬜️
map_from_arrays(col1, col2)⬜️
map_from_entries(col)⬜️
map_keys(col)⬜️
map_values(col)⬜️
map_zip_with(col1, col2, f)⬜️
max(col)
md5(col)⬜️
mean(col)⬜️
min(col)
minute(col)⬜️
monotonically_increasing_id()⬜️
month(col)⬜️
months(col)⬜️
months_between(date1, date2[, roundOff])⬜️
nanvl(col1, col2)⬜️
next_day(date, dayOfWeek)⬜️
nth_value(col, offset[, ignoreNulls])⬜️
ntile(n)⬜️
overlay(src, replace, pos[, len])⬜️
pandas_udf([f, returnType, functionType])⬜️
percent_rank()⬜️
percentile_approx(col, percentage[, accuracy])⬜️
posexplode(col)⬜️
posexplode_outer(col)⬜️
pow(col1, col2)⬜️
quarter(col)⬜️
radians(col)⬜️
raise_error(errMsg)⬜️
rand([seed])⬜️
randn([seed])⬜️
rank()⬜️
regexp_extract(str, pattern, idx)⬜️
regexp_replace(str, pattern, replacement)⬜️
repeat(col, n)⬜️
reverse(col)⬜️
rint(col)⬜️
round(col[, scale])⬜️
row_number()⬜️
rpad(col, len, pad)⬜️
rtrim(col)⬜️
schema_of_csv(csv[, options])⬜️
schema_of_json(json[, options])⬜️
second(col)⬜️
sequence(start, stop[, step])⬜️
sha1(col)⬜️
sha2(col, numBits)⬜️
shiftLeft(col, numBits)⬜️
shiftRight(col, numBits)⬜️
shiftRightUnsigned(col, numBits)⬜️
shuffle(col)⬜️
signum(col)⬜️
sin(col)⬜️
sinh(col)⬜️
size(col)⬜️
skewness(col)⬜️
slice(x, start, length)⬜️
sort_array(col[, asc])⬜️
soundex(col)⬜️
spark_partition_id()⬜️
split(str, pattern[, limit])⬜️
sqrt(col)⬜️
stddev(col)⬜️
stddev_pop(col)⬜️
stddev_samp(col)⬜️
struct(*cols)⬜️
substring(str, pos, len)
substring_index(str, delim, count)⬜️
sum(col)
sumDistinct(col)⬜️
tan(col)⬜️
tanh(col)⬜️
timestamp_seconds(col)⬜️
toDegrees(col)⬜️
toRadians(col)⬜️
to_csv(col[, options])⬜️
to_date(col[, format])⬜️
to_json(col[, options])⬜️
to_timestamp(col[, format])⬜️
to_utc_timestamp(timestamp, tz)⬜️
transform(col, f)⬜️
transform_keys(col, f)⬜️
transform_values(col, f)⬜️
translate(srcCol, matching, replace)⬜️
trim(col)⬜️
trunc(date, format)⬜️
udf([f, returnType])⬜️
unbase64(col)⬜️
unhex(col)⬜️
unix_timestamp([timestamp, format])⬜️
upper(col)⬜️
var_pop(col)⬜️
var_samp(col)⬜️
variance(col)⬜️
weekofyear(col)⬜️
when(condition, value)
window(timeColumn, windowDuration[, …])⬜️
xxhash64(*cols)⬜️
year(col)⬜️
years(col)⬜️
zip_with(left, right, f)⬜️
from_avro(data, jsonFormatSchema[, options])⬜️
to_avro(data[, jsonFormatSchema])⬜️

Window

Functions | Supported ✅?
Window.currentRow⬜️
Window.orderBy(*cols)⬜️
Window.partitionBy(*cols)⬜️
Window.rangeBetween(start, end)⬜️
Window.rowsBetween(start, end)⬜️
Window.unboundedFollowing⬜️
Window.unboundedPreceding⬜️
WindowSpec.orderBy(*cols)⬜️
WindowSpec.partitionBy(*cols)⬜️
WindowSpec.rangeBetween(start, end)⬜️
WindowSpec.rowsBetween(start, end)⬜️

Grouping

Functions | Supported ✅?
GroupedData.agg(*exprs)
GroupedData.apply(udf)⬜️
GroupedData.applyInPandas(func, schema)⬜️
GroupedData.avg(*cols)
GroupedData.cogroup(other)⬜️
GroupedData.count()
GroupedData.max(*cols)⬜️
GroupedData.mean(*cols)⬜️
GroupedData.min(*cols)⬜️
GroupedData.pivot(pivot_col[, values])⬜️
GroupedData.sum(*cols)
PandasCogroupedOps.applyInPandas(func, schema)⬜️