1 year ago

#387661

test-img

Ajay Makkar

JSON string to dataframe "change schema" schema contain Ambiguous column - Spark Scala

How to convert this JSON to Dataframe in scala

json_string = """{
    "module": {
        "col1": "a",
        "col2": {
            "5": 1,
            "3": 4,
            "4": {
                "numeric reasoning": 2,
                "verbal": 4
            },
            "7": {
                "landline": 2,
                "landLine": 4
            }
        }
    }
}"""

Function I use -

 val jsonRDD = spark.parallelize(json_string::Nil)
 val jsonDF = sqlContext.read.json(jsonRDD)
 val df = flattenRecursive(jsonDF)
 df.show()


def flattenRecursive(df: DataFrame): DataFrame = {

    val fields = df.schema.fields
    val fieldNames = fields.map(x => x.name)
    val length = fields.length
    
    for(i <- 0 to fields.length-1){
      val field = fields(i)
      val fieldtype = field.dataType
      val fieldName = field.name
      fieldtype match {
        case arrayType: ArrayType => 
           println("flatten array")
          val newfieldNames = fieldNames.filter(_!=fieldName) ++ Array("explode_outer(".concat(fieldName).concat(") as ").concat(fieldName))
          val explodedDf = df.selectExpr(newfieldNames:_*)
          return flattenRecursive(explodedDf)
        case structType: StructType => 
          println("flatten struct")
          val newfieldNames = fieldNames.filter(_!= fieldName) ++ structType.fieldNames.map(childname => fieldName.concat(".").concat(childname)                                                                                                        .concat(" as ").concat(fieldName).concat("_").concat(childname))
                val explodedf = df.selectExpr(newfieldNames:_*)
              return flattenRecursive(explodedf)
            case _ => 
              println("other type")
          }
        }
        df
      }

Error I face is -

Ambiguous reference to fields StructField(landLine,LongType,true), StructField(landline,LongType,true);

Required output - if we can edit 1 landline column to landline_1 before explode

**Note - please provide the generic code because

  1. I don't know on which level I face this ambiguity and also
  2. I don't know the schema while running the code**

json

dataframe

scala

apache-spark

schema

0 Answers

Your Answer

Accepted video resources