Feature engineering is a difficult and time-consuming process. ML Featurizer is a library that enables users to create additional features from raw data with ease. It extends and enriches Spark's existing feature engineering functionality.
- Unary Temporal Featurizers
- DayOfWeekFeaturizer
- HourOfDayFeaturizer
- MonthOfYearFeaturizer
- PartsOfDayFeaturizer
- WeekendFeaturizer
- Unary Numeric Featurizers
- LogTransformFeaturizer
- MathFeaturizer
- PowerTransformFeaturizer
- Binary Temporal Featurizers
- DateDiffFeaturizer
- Binary Numeric Featurizers
- AdditionFeaturizer
- DivisionFeaturizer
- MultiplicationFeaturizer
- SubtractionFeaturizer
- Binary String Featurizers
- ConcateColumnsFeaturizer
- Grouping Featurizers
- GroupByFeaturizer (count, ratio, min, max, avg, sum)
- GEO Featurizers
- GeohashFeaturizer (convert latitude and longitude into geohash)
/**
 * Minimal example: applies a `DayOfWeekFeaturizer` to a tiny in-memory DataFrame
 * and prints the result.
 *
 * The featurizer reads the string column `date` (parsed with the pattern
 * `yyyy-MM-dd`) and writes its output to a new column named `dayOfWeek`.
 */
object DayOfWeekFeaturizerExample {

  /**
   * Entry point. Starts a local Spark session, runs the featurizer, and shows the
   * transformed DataFrame on stdout.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DayOfWeekFeaturizer").master("local").getOrCreate()

    try {
      // One (id, date) row per sample; ids are unique.
      val data = Seq(
        (0, "2018-01-02"),
        (1, "2018-02-02"),
        (2, "2018-03-02"),
        (3, "2018-04-05"),
        (4, "2018-05-05")
      )
      val dataFrame = spark.createDataFrame(data).toDF("id", "date")

      val featurizer = new DayOfWeekFeaturizer()
        .setInputCol("date")
        .setOutputCol("dayOfWeek")
        .setFormat("yyyy-MM-dd")

      val featurizedDataFrame = featurizer.transform(dataFrame)
      featurizedDataFrame.show()
    } finally {
      // Release the local Spark context even if the transform fails.
      spark.stop()
    }
  }
}
object FeaturePipeline {
/**
 * Entry point. Builds a small in-memory DataFrame and runs it through a Spark ML
 * `Pipeline` that chains ML Featurizer stages with Spark's own `StringIndexer`
 * and `OneHotEncoder`, then prints the transformed DataFrame to stdout.
 *
 * Stages, in order:
 *  - `DayOfWeekFeaturizer`:   `date` -> `dayOfWeek`
 *  - `MonthOfYearFeaturizer`: `date` -> `monthOfYear`
 *  - `WeekendFeaturizer`:     `date` -> `isWeekend`
 *  - `AdditionFeaturizer`:    `price1`, `price2` -> `price1_add_price2`
 *  - `StringIndexer`:         `brand` -> `brandIndex`
 *  - `OneHotEncoder`:         `brandIndex` -> `brandVector`
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val spark = SparkSession.builder().appName("FeaturePipeline").master("local").getOrCreate()

  try {
    // One (id, date, price1, price2, brand) row per sample; ids are unique.
    val data = Seq(
      (0, "2018-01-02", 1.0, 2.0, "mercedes"),
      (1, "2018-02-02", 2.5, 3.5, "lexus"),
      (2, "2018-03-02", 5.0, 1.0, "toyota"),
      (3, "2018-04-05", 8.0, 9.0, "tesla"),
      (4, "2018-05-05", 1.0, 5.0, "bmw")
    )
    val dataFrame = spark.createDataFrame(data).toDF("id", "date", "price1", "price2", "brand")

    // All temporal featurizers parse the same `date` column with the same pattern.
    val dateFormat = "yyyy-MM-dd"

    val dayOfWeekFeaturizer = new DayOfWeekFeaturizer()
      .setInputCol("date")
      .setOutputCol("dayOfWeek")
      .setFormat(dateFormat)

    val monthOfYearFeaturizer = new MonthOfYearFeaturizer()
      .setInputCol("date")
      .setOutputCol("monthOfYear")
      .setFormat(dateFormat)

    val weekendFeaturizer = new WeekendFeaturizer()
      .setInputCol("date")
      .setOutputCol("isWeekend")
      .setFormat(dateFormat)

    val additionFeaturizer = new AdditionFeaturizer()
      .setInputCols("price1", "price2")
      .setOutputCol("price1_add_price2")

    // The indexer must run before the encoder: the encoder consumes `brandIndex`.
    val indexer = new StringIndexer()
      .setInputCol("brand")
      .setOutputCol("brandIndex")

    val encoder = new OneHotEncoder()
      .setInputCol("brandIndex")
      .setOutputCol("brandVector")

    val pipeline = new Pipeline()
      .setStages(Array(dayOfWeekFeaturizer, monthOfYearFeaturizer, weekendFeaturizer, additionFeaturizer,
        indexer, encoder))

    val model = pipeline.fit(dataFrame)
    model.transform(dataFrame).show()
  } finally {
    // Release the local Spark context even if fitting or transforming fails.
    spark.stop()
  }
}
}

If you're interested in contributing to this project, check out our contribution guidelines!