agg
|
|
agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame agg(expr: Column, exprs: Column*): DataFrame agg(exprs: Map[String, String]): DataFrame |
An untyped transformation
|
alias
|
|
alias(alias: String): Dataset[T] alias(alias: Symbol): Dataset[T] |
A typed transformation that is a mere synonym of as.
|
apply
|
|
apply(colName: String): Column |
An untyped transformation to select a column based on the column name (i.e. maps a Dataset onto a Column)
|
as
|
|
as(alias: String): Dataset[T] as(alias: Symbol): Dataset[T] |
|
as
|
|
as[U : Encoder]: Dataset[U] |
A typed transformation to enforce a type, i.e. to mark the records in the Dataset as being of a given data type (data type conversion). The as operator simply changes the view of the data that is passed into typed operations (e.g. map) and does not eagerly project away any columns that are not present in the specified class.
|
cache
|
A basic action that is a mere synonym of persist.
|
checkpoint
|
|
checkpoint(): Dataset[T] checkpoint(eager: Boolean): Dataset[T] |
A basic action to checkpoint the Dataset in a reliable way (using a reliable HDFS-compliant file system, e.g. Hadoop HDFS or Amazon S3)
|
coalesce
|
|
coalesce(numPartitions: Int): Dataset[T] |
A typed transformation to repartition a Dataset
|
col
|
|
col(colName: String): Column |
An untyped transformation to create a column (reference) based on the column name
|
collect
|
|
colRegex
|
|
colRegex(colName: String): Column |
An untyped transformation to create a column (reference) based on the column name specified as a regex
|
columns
|
|
count
|
An action to count the number of rows
|
createGlobalTempView
|
|
createGlobalTempView(viewName: String): Unit |
|
createOrReplaceGlobalTempView
|
|
createOrReplaceGlobalTempView(viewName: String): Unit |
|
createOrReplaceTempView
|
|
createOrReplaceTempView(viewName: String): Unit |
|
createTempView
|
|
createTempView(viewName: String): Unit |
|
crossJoin
|
|
crossJoin(right: Dataset[_]): DataFrame |
An untyped transformation
|
cube
|
|
cube(cols: Column*): RelationalGroupedDataset cube(col1: String, cols: String*): RelationalGroupedDataset |
An untyped transformation
|
describe
|
|
describe(cols: String*): DataFrame |
|
distinct
|
A typed transformation that is a mere synonym of dropDuplicates (with all the columns of the Dataset)
|
drop
|
|
drop(colName: String): DataFrame drop(colNames: String*): DataFrame drop(col: Column): DataFrame |
An untyped transformation
|
dropDuplicates
|
|
dropDuplicates(): Dataset[T] dropDuplicates(colNames: Array[String]): Dataset[T] dropDuplicates(colNames: Seq[String]): Dataset[T] dropDuplicates(col1: String, cols: String*): Dataset[T] |
|
dtypes
|
|
dtypes: Array[(String, String)] |
|
except
|
|
except(other: Dataset[T]): Dataset[T] |
|
exceptAll
|
|
exceptAll(other: Dataset[T]): Dataset[T] |
(New in 2.4.0) A typed transformation
|
explain
|
|
explain(): Unit explain(extended: Boolean): Unit |
A basic action to display the logical and physical plans of the Dataset (with optional cost and codegen summaries) on the standard output
|
filter
|
|
filter(condition: Column): Dataset[T] filter(conditionExpr: String): Dataset[T] filter(func: T => Boolean): Dataset[T] |
|
first
|
An action that is a mere synonym of head
|
flatMap
|
|
flatMap[U : Encoder](func: T => TraversableOnce[U]): Dataset[U] |
|
foreach
|
|
foreach(f: T => Unit): Unit |
|
foreachPartition
|
|
foreachPartition(f: Iterator[T] => Unit): Unit |
|
groupBy
|
|
groupBy(cols: Column*): RelationalGroupedDataset groupBy(col1: String, cols: String*): RelationalGroupedDataset |
An untyped transformation
|
groupByKey
|
|
groupByKey[K: Encoder](func: T => K): KeyValueGroupedDataset[K, T] |
|
head
|
|
head(): T (1) head(n: Int): Array[T] |
|
hint
|
|
hint(name: String, parameters: Any*): Dataset[T] |
A basic action to specify a hint (and optional parameters)
|
inputFiles
|
|
inputFiles: Array[String] |
|
intersect
|
|
intersect(other: Dataset[T]): Dataset[T] |
|
intersectAll
|
|
intersectAll(other: Dataset[T]): Dataset[T] |
(New in 2.4.0) A typed transformation
|
isEmpty
|
(New in 2.4.0) A basic action
|
isLocal
|
|
isStreaming
|
|
join
|
|
join(right: Dataset[_]): DataFrame join(right: Dataset[_], usingColumn: String): DataFrame join(right: Dataset[_], usingColumns: Seq[String]): DataFrame join(right: Dataset[_], usingColumns: Seq[String], joinType: String): DataFrame join(right: Dataset[_], joinExprs: Column): DataFrame join(right: Dataset[_], joinExprs: Column, joinType: String): DataFrame |
An untyped transformation
|
joinWith
|
|
joinWith[U](other: Dataset[U], condition: Column): Dataset[(T, U)] joinWith[U](other: Dataset[U], condition: Column, joinType: String): Dataset[(T, U)] |
|
limit
|
|
limit(n: Int): Dataset[T] |
|
localCheckpoint
|
|
localCheckpoint(): Dataset[T] localCheckpoint(eager: Boolean): Dataset[T] |
A basic action to checkpoint the Dataset locally on executors (and therefore unreliably)
|
map
|
|
map[U: Encoder](func: T => U): Dataset[U] |
|
mapPartitions
|
|
mapPartitions[U : Encoder](func: Iterator[T] => Iterator[U]): Dataset[U] |
|
na
|
An untyped transformation
|
orderBy
|
|
orderBy(sortExprs: Column*): Dataset[T] orderBy(sortCol: String, sortCols: String*): Dataset[T] |
|
persist
|
|
persist(): this.type persist(newLevel: StorageLevel): this.type |
A basic action to persist the Dataset
|
printSchema
|
|
randomSplit
|
|
randomSplit(weights: Array[Double]): Array[Dataset[T]] randomSplit(weights: Array[Double], seed: Long): Array[Dataset[T]] |
A typed transformation to split a Dataset randomly into multiple Datasets, one per weight in the given weights array
|
rdd
|
|
reduce
|
|
reduce(func: (T, T) => T): T |
An action to reduce the records of the Dataset using the specified binary function.
|
repartition
|
|
repartition(partitionExprs: Column*): Dataset[T] repartition(numPartitions: Int): Dataset[T] repartition(numPartitions: Int, partitionExprs: Column*): Dataset[T] |
A typed transformation to repartition a Dataset
|
repartitionByRange
|
|
repartitionByRange(partitionExprs: Column*): Dataset[T] repartitionByRange(numPartitions: Int, partitionExprs: Column*): Dataset[T] |
|
rollup
|
|
rollup(cols: Column*): RelationalGroupedDataset rollup(col1: String, cols: String*): RelationalGroupedDataset |
An untyped transformation
|
sample
|
|
sample(withReplacement: Boolean, fraction: Double): Dataset[T] sample(withReplacement: Boolean, fraction: Double, seed: Long): Dataset[T] sample(fraction: Double): Dataset[T] sample(fraction: Double, seed: Long): Dataset[T] |
|
schema
|
|
select
|
|
select(cols: Column*): DataFrame select(col: String, cols: String*): DataFrame select[U1](c1: TypedColumn[T, U1]): Dataset[U1] select[U1, U2](c1: TypedColumn[T, U1], c2: TypedColumn[T, U2]): Dataset[(U1, U2)] select[U1, U2, U3]( c1: TypedColumn[T, U1], c2: TypedColumn[T, U2], c3: TypedColumn[T, U3]): Dataset[(U1, U2, U3)] select[U1, U2, U3, U4]( c1: TypedColumn[T, U1], c2: TypedColumn[T, U2], c3: TypedColumn[T, U3], c4: TypedColumn[T, U4]): Dataset[(U1, U2, U3, U4)] select[U1, U2, U3, U4, U5]( c1: TypedColumn[T, U1], c2: TypedColumn[T, U2], c3: TypedColumn[T, U3], c4: TypedColumn[T, U4], c5: TypedColumn[T, U5]): Dataset[(U1, U2, U3, U4, U5)] |
An (untyped and typed) transformation
|
selectExpr
|
|
selectExpr(exprs: String*): DataFrame |
An untyped transformation
|
show
|
|
show(): Unit show(truncate: Boolean): Unit show(numRows: Int): Unit show(numRows: Int, truncate: Boolean): Unit show(numRows: Int, truncate: Int): Unit show(numRows: Int, truncate: Int, vertical: Boolean): Unit |
|
sort
|
|
sort(sortExprs: Column*): Dataset[T] sort(sortCol: String, sortCols: String*): Dataset[T] |
A typed transformation to sort elements globally (across partitions). Use the sortWithinPartitions transformation for a partition-local sort.
|
sortWithinPartitions
|
|
sortWithinPartitions(sortExprs: Column*): Dataset[T] sortWithinPartitions(sortCol: String, sortCols: String*): Dataset[T] |
A typed transformation to sort elements within partitions (aka local sort). Use the sort transformation for a global sort (across partitions).
|
stat
|
|
stat: DataFrameStatFunctions |
An untyped transformation
|
storageLevel
|
|
storageLevel: StorageLevel |
|
summary
|
|
summary(statistics: String*): DataFrame |
An action to calculate statistics (e.g. count, mean, stddev, min, max and the 25%, 50%, 75% percentiles)
|
take
|
An action to take the first records of a Dataset
|
toDF
|
|
toDF(): DataFrame toDF(colNames: String*): DataFrame |
A basic action to convert a Dataset to a DataFrame
|
toJSON
|
|
toLocalIterator
|
|
toLocalIterator(): java.util.Iterator[T] |
An action that returns an iterator with all rows in the Dataset. The iterator will consume as much memory as the largest partition in the Dataset.
|
transform
|
|
transform[U](t: Dataset[T] => Dataset[U]): Dataset[U] |
A typed transformation for chaining custom transformations
|
union
|
|
union(other: Dataset[T]): Dataset[T] |
|
unionByName
|
|
unionByName(other: Dataset[T]): Dataset[T] |
|
unpersist
|
|
unpersist(): this.type (1) unpersist(blocking: Boolean): this.type |
(1) Uses unpersist with blocking disabled (false)
A basic action to unpersist the Dataset
|
where
|
|
where(condition: Column): Dataset[T] where(conditionExpr: String): Dataset[T] |
|
withColumn
|
|
withColumn(colName: String, col: Column): DataFrame |
An untyped transformation
|
withColumnRenamed
|
|
withColumnRenamed(existingName: String, newName: String): DataFrame |
An untyped transformation
|
write
|
|
write: DataFrameWriter[T] |
A basic action that returns a DataFrameWriter for saving the content of the (non-streaming) Dataset out to an external storage
|