Dataframe Operators
Dateframe operators for both pd.dataframe and pyspark.dataframe
Dataframe operator cheat sheet for pd.dataframe and pyspark.dataframe
sort
pd.dataframe
df.sort_values("col")
pyspark.dataframe
df.sort(desc(col))
tocsv
pd.dataframe
df.to_csv(file_name)
pyspark.dataframe
df.repartition(1).write.mode('overwrite').option("header","true").csv(file_name)
tojson
pd.dataframe
df.to_json(file_name,orient='records',lines=True)
pyspark.dataframe
df.repartition(1).write.mode('overwrite').option("header","true").json(file_name)
column_rename
pd.dataframe
df.rename('old','new')
pyspark.dataframe
df.withColumnRenamed('old','new')
readcsv
pd.dataframe
pd.read_csv('file',sep=',')
pyspark.dataframe
spark.read.csv('file',sep=',')
readjson
pd.dataframe
pd.read_json('file',lines=True)
pyspark.dataframe
spark.read.json('file',lines=True)
num_row
pd.dataframe
df.shape[0]
pyspark.dataframe
df.count()
num_col
pd.dataframe
df.shape[1]
pyspark.dataframe
len(df.columns)
string not contain
pd.dataframe
df[~df['col'].contains('pattern')]
pyspark.dataframe
df.filter("col not like '%pattern%'")
not in
pd.dataframe
df[df['col'].isin([a,b,c])]
pyspark.dataframe
from pyspark.sql.function import col; df.filter(~col('bar').isin(['a','b']))
new col
pd.dataframe
df['new_col']= 'abc'
pyspark.dataframe
from pyspark.sql.function import lit; df.withColumn('new_col',lit('abc'))
multi select
pd.dataframe
df[['a','b']]
pyspark.dataframe
df[['a','b']]
distinct count of a column
pd.dataframe
df['col'].nunique()
pyspark.dataframe
df.select('col').distinct()
set ops-subtract
pyspark.dataframe
df1.select("sentence").subtract(df2.select("sentence")).distinct()
set ops-intersect
pyspark.dataframe
df1.select("sentence").intersect(df2.select("sentence")).distinct()
set ops-union
pyspark.dataframe
df1.select("sentence").union(df2.select("sentence")).distinct()
str_bool to int
pd.dataframe
(df['col'] == 'TRUE').astype('i2')
pyspark.dataframe
(df['col'] == 'TRUE').astype(int)
time difference
pyspark.dataframe
spark.sql('SELECT datediff(all.timestamp2,all.timestamp1)*24*60*60 + (hour(all.timestamp2)- hour(all.timestamp1))*60*60 + (minute(all.timestamp2) - minute(all.timestamp1))*60 + (second(all.timestamp2) - second(all.timestamp1)) as tol_time_diff from df')
groupby+sum(str)
pd.dataframe
def test_sum(series):
l = series.values.tolist()
new_l = [en for en in l if en!='']
return ';'.join(new_l)
target = x.groupby('uid').agg({'str':test_sum})
column to list
pd.dataframe
df['col'].tolist()
Written on August 30, 2019