
Spark DataSource

DataFrameWriter

format

DataFrameWriter.format(source) only records the data source name; when save() is called, that name is resolved to a concrete class:

val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf)

If format() is never called, source falls back to the session default (spark.sql.sources.default, which is parquet out of the box):

private var source: String = df.sparkSession.sessionState.conf.defaultDataSourceName

This string is the key used to find the corresponding data source class.
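A typical call looks like this (the output path is made up for illustration):

// format("parquet") sets source = "parquet"; save() then triggers the lookup above.
df.write
  .format("parquet")
  .mode("overwrite")
  .save("/tmp/example-output")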


save() then dispatches on the resolved class:

val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf)
if (classOf[DataSourceV2].isAssignableFrom(cls)) {
  val ds = cls.newInstance()
  ds match {
    case ws: WriteSupport =>
      val options = new DataSourceOptions((extraOptions ++
        DataSourceV2Utils.extractSessionConfigs(
          ds = ds.asInstanceOf[DataSourceV2],
          conf = df.sparkSession.sessionState.conf)).asJava)
      // Using a timestamp and a random UUID to distinguish different writing jobs. This is good
      // enough as there won't be tons of writing jobs created at the same second.
      val jobId = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
        .format(new Date()) + "-" + UUID.randomUUID()
      val writer = ws.createWriter(jobId, df.logicalPlan.schema, mode, options)
      if (writer.isPresent) {
        runCommand(df.sparkSession, "save") {
          WriteToDataSourceV2(writer.get(), df.logicalPlan)
        }
      }

    // The source implements v2 but has no v2 batch write support: fall back to v1.
    case _ => saveToV1Source()
  }
} else {
  saveToV1Source()
}
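How does lookupDataSource turn a string like "parquet" into a class? It first tries the name as a fully qualified class name, and otherwise scans DataSourceRegister implementations on the classpath (via java.util.ServiceLoader) and matches on shortName(). A minimal sketch of hooking into that lookup; the package, class, and short name here are all made up:

package com.example.sources

import org.apache.spark.sql.sources.DataSourceRegister

// Listing this class in META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
// lets lookupDataSource resolve format("myformat") to it. A real source would also
// implement a provider trait (e.g. RelationProvider) or the DataSourceV2 interfaces.
class MyFormatSource extends DataSourceRegister {
  override def shortName(): String = "myformat"
}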

Reading back parquet files written by an older parquet-mr (1.6.0 here) can produce warnings like the following:
2018-03-14 09:39:19,706 WARN  [Executor task launch worker for task 175] parquet.CorruptStatistics: Ignoring statistics because created_by could not be parsed (see PARQUET-251): parquet-mr version 1.6.0
org.apache.parquet.VersionParser$VersionParseException: Could not parse created_by: parquet-mr version 1.6.0 using format: (.+) version ((.*) )?\(build ?(.*)\)
        at org.apache.parquet.VersionParser.parse(VersionParser.java:112)
        at org.apache.parquet.CorruptStatistics.shouldIgnoreStatistics(CorruptStatistics.java:60)
        at org.apache.parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics(ParquetMetadataConverter.java:263)
        at org.apache.parquet.format.converter.ParquetMetadataConverter.fromParquetMetadata(ParquetMetadataConverter.java:567)
        at org.apache.parquet.format.converter.ParquetMetadataConverter.readParquetMetadata(ParquetMetadataConverter.java:544)
        at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:431)
        at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:386)
        at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:107)
        at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:381)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:355)
        at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:168)
        at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown Source)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
        at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
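The warning is benign. Because of PARQUET-251 (min/max statistics written by old parquet-mr versions can be wrong for binary columns), the reader parses the created_by footer field to decide whether the statistics can be trusted. The string "parquet-mr version 1.6.0" has no "(build ...)" suffix, so the version regex from the log never matches, parsing throws, and shouldIgnoreStatistics simply drops the statistics. A quick check of the regex against both shapes of string (the 1.8.1 build hash below is made up):

// The regex from the log, tried against the failing and a well-formed created_by string.
val versionRegex = """(.+) version ((.*) )?\(build ?(.*)\)""".r
println(versionRegex.findFirstMatchIn("parquet-mr version 1.6.0"))                 // None
println(versionRegex.findFirstMatchIn("parquet-mr version 1.8.1 (build abc123)"))  // Some(...)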



parquet

For "parquet" the lookup resolves to ParquetFileFormat (ParquetFileFormat.scala):

package org.apache.spark.sql.execution.datasources.parquet
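ParquetFileFormat is a V1 FileFormat that also extends DataSourceRegister with shortName "parquet", which is why both the explicit short name and the unset default resolve to it. For example (the path is made up):

// Both reads resolve to ParquetFileFormat via lookupDataSource.
val df1 = spark.read.format("parquet").load("/tmp/example-output")
val df2 = spark.read.load("/tmp/example-output")  // spark.sql.sources.default = parquet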









