Dataframe Operators

A DataFrame operator cheat sheet covering both pandas (pd.dataframe) and PySpark (pyspark.dataframe).
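All the snippets below assume a pandas import and an active SparkSession. A minimal setup sketch (the sample data is hypothetical, only there to make the one-liners runnable):

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('cheatsheet').getOrCreate()

# each snippet uses `df` for whichever flavor of DataFrame is being discussed
df = pd.DataFrame({'col': ['b', 'a', 'c']})   # pd.dataframe
df_spark = spark.createDataFrame(df)          # pyspark.dataframe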

sort

pd.dataframe

df.sort_values("col")

pyspark.dataframe

from pyspark.sql.functions import desc; df.sort(desc('col'))
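Note that pandas sorts ascending by default while the PySpark line above sorts descending; the mirrored directions:

df.sort_values('col', ascending=False)  # pd.dataframe, descending
df.sort('col')                          # pyspark.dataframe, ascending (the default)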

tocsv

pd.dataframe

df.to_csv(file_name)

pyspark.dataframe

df.repartition(1).write.mode('overwrite').option("header","true").csv(file_name)
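Unlike pandas, Spark writes a directory named file_name containing part files; repartition(1) collapses the output to a single part file. coalesce(1) achieves the same without a full shuffle:

df.coalesce(1).write.mode('overwrite').option('header', 'true').csv(file_name)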

tojson

pd.dataframe

df.to_json(file_name,orient='records',lines=True)

pyspark.dataframe

df.repartition(1).write.mode('overwrite').json(file_name)

column_rename

pd.dataframe

df.rename(columns={'old': 'new'})

pyspark.dataframe

df.withColumnRenamed('old','new')
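To rename several columns at once (a sketch with made-up names):

df.rename(columns={'old1': 'new1', 'old2': 'new2'})  # pd.dataframe
df.withColumnRenamed('old1', 'new1').withColumnRenamed('old2', 'new2')  # pyspark.dataframe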

readcsv

pd.dataframe

pd.read_csv('file',sep=',')

pyspark.dataframe

spark.read.csv('file',sep=',')
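Unlike pandas, Spark treats the first row as data and reads every column as string by default; a sketch with both behaviors switched on:

spark.read.csv('file', sep=',', header=True, inferSchema=True)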

readjson

pd.dataframe

pd.read_json('file',lines=True)

pyspark.dataframe

spark.read.json('file')
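Spark expects JSON Lines (one record per line) by default, so no lines flag is needed; for a file holding one big JSON array, pass multiLine instead:

spark.read.json('file', multiLine=True)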

num_row

pd.dataframe

df.shape[0]

pyspark.dataframe

df.count()

num_col

pd.dataframe

df.shape[1]

pyspark.dataframe

len(df.columns)

string not contain

pd.dataframe

df[~df['col'].str.contains('pattern')]

pyspark.dataframe

df.filter("col not like '%pattern%'")

not in

pd.dataframe

df[~df['col'].isin(['a','b','c'])]

pyspark.dataframe

from pyspark.sql.functions import col; df.filter(~col('col').isin(['a','b']))

new col

pd.dataframe

df['new_col'] = 'abc'

pyspark.dataframe

from pyspark.sql.functions import lit; df.withColumn('new_col', lit('abc'))
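For a column derived from existing ones rather than a constant (a sketch assuming numeric columns a and b):

df['c'] = df['a'] + df['b']  # pd.dataframe

from pyspark.sql.functions import col; df.withColumn('c', col('a') + col('b'))  # pyspark.dataframe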

multi select

pd.dataframe

df[['a','b']] 

pyspark.dataframe

df[['a','b']]
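The bracket form happens to work in PySpark too, but select is the conventional spelling:

df.select('a', 'b')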

distinct count of a column

pd.dataframe

df['col'].nunique()

pyspark.dataframe

df.select('col').distinct().count()

set ops-subtract

pyspark.dataframe

df1.select("sentence").subtract(df2.select("sentence")).distinct()

set ops-intersect

pyspark.dataframe

df1.select("sentence").intersect(df2.select("sentence")).distinct()

set ops-union

pyspark.dataframe

df1.select("sentence").union(df2.select("sentence")).distinct()

str_bool to int

pd.dataframe

(df['col'] == 'TRUE').astype('i2')

pyspark.dataframe

from pyspark.sql.functions import col; df.withColumn('col_int', (col('col') == 'TRUE').cast('int'))

time difference

pyspark.dataframe

df.createOrReplaceTempView('df')
spark.sql("SELECT datediff(t.timestamp2, t.timestamp1)*24*60*60 + (hour(t.timestamp2) - hour(t.timestamp1))*60*60 + (minute(t.timestamp2) - minute(t.timestamp1))*60 + (second(t.timestamp2) - second(t.timestamp1)) AS tol_time_diff FROM df t")
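A simpler route to the same total difference in seconds, assuming both columns are already timestamps:

from pyspark.sql.functions import unix_timestamp
df.withColumn('tol_time_diff', unix_timestamp('timestamp2') - unix_timestamp('timestamp1'))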

groupby+sum(str)

pd.dataframe

def test_sum(series):
    # join the non-empty strings in each group with ';'
    return ';'.join(v for v in series.tolist() if v != '')

target = df.groupby('uid').agg({'str': test_sum})
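A rough PySpark equivalent (a sketch assuming the same uid and str column names), filtering out empty strings before concatenating with collect_list and concat_ws:

from pyspark.sql.functions import col, collect_list, concat_ws
df.where(col('str') != '').groupBy('uid').agg(concat_ws(';', collect_list('str')).alias('str'))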

column to list

pd.dataframe

df['col'].tolist()
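A PySpark counterpart (a sketch; collect() pulls the whole column to the driver, so mind the size):

[row['col'] for row in df.select('col').collect()]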
Written on August 30, 2019