PySpark - CSV to Parquet Column Header (Special Character “/”) error

Refresh

March 2019

Views

3 times

0

I am having a problem running this — it runs fine for Track Count, but Transaction/Time has a special character ("/") in its header and the query fails. How can I handle this?

What I am looking for is either a way to skip the header row and define my own column names, or a way to remove the special characters from the header row.

Thanks!!

from pyspark.sql import SparkSession import traceback from pyspark.sql.functions import * import csv import os

def create_parquet():
    """Read a merchant CSV from S3 and rewrite it as Parquet.

    Column headers containing special characters (e.g. "Transaction/Time")
    are quoted with backticks in Spark SQL and aliased to Parquet-safe
    names, since Parquet rejects characters like "/" in column names.

    Returns:
        None. Writes Parquet files to S3 as a side effect; on failure the
        traceback and error are printed (best-effort, no re-raise).
    """
    try:
        spark = SparkSession.builder.appName("prc_conversiontoparquet").getOrCreate()

        df = spark.read.option("header", True).option("delimiter", ",").\
            option("multiLine", "true").\
            csv("s3://ert-opp-uw2-external-data-dev/sftp_data/merchant_1.csv")
        df.createOrReplaceTempView("input")

        # Backticks (not single quotes) quote identifiers in Spark SQL:
        # `Track Count` references the column, whereas 'Track Count' would
        # select the literal string. A comma is also required between the
        # two select expressions — the original query omitted it and the
        # second column used single quotes.
        query = """
                  SELECT
                        string(`Transaction/Time`) as Transaction_Time,
                        string(`Track Count`) as Track_Count
                  FROM input
                """

        print(query)

        result = spark.sql(query)
        result.repartition(100).write.mode('overwrite').parquet("s3://ert-opp-uw2-external-data-dev/sftp_data/parquet_files/")

    except Exception as e:
        # Best-effort logging; deliberately swallows the error so a failed
        # conversion does not abort the surrounding job.
        print(traceback.format_exc())
        print("Error Occurred")
        print(e)

create_parquet()

0 answers