Pandas

Create Dataframe


# Create a simple Pandas DataFrame
import pandas as pd
import numpy as np
df = pd.DataFrame([[1,'A',1.23,10],[2,'B',2.34,9],[3,np.nan,float('NaN'),None],[4,'D',4.45,7]], columns = ['idx','cat','float','int'])


import pandas as pd
import numpy as np
df = pd.DataFrame({'cat': ['A', 'B', np.nan, 'D'], 'float': [1.23, 2.34, float('NaN'), 4.45], 'int': [10, 9, None, 7]})
#df.reset_index(inplace=True)    # Creates an integer index if needed (not needed here because Pandas creates one automatically)
print(df)
#   cat  float   int
#0    A   1.23  10.0
#1    B   2.34   9.0
#2  NaN    NaN   NaN
#3    D   4.45   7.0

# Check the index
if df.index.empty: raise Exception("WARNING:  Dataframe index is empty")
if not df.index.is_unique: raise Exception("WARNING:  Dataframe index is NOT considered unique")
print("df.first_valid_index(): ", df.first_valid_index())



# Create a Pandas dataframe from a list of Unix datetime values

import pandas as pd
import numpy as np

# List of Unix datetime values
unix_datetimes = [1702055700, 1702054318, 1702054250, 1702054080, 1702050060, 1702048116, 1702046440, 1702044611]
# Convert to Numpy array as data type datetime64[s] (compatible with Pandas)
datetime64 = np.array(unix_datetimes).astype('datetime64[s]')
#print(datetime64)    # ['2023-12-08T17:15:00' '2023-12-08T16:51:58' '2023-12-08T16:50:50' '2023-12-08T16:48:00' '2023-12-08T15:41:00' '2023-12-08T15:08:36'  '2023-12-08T14:40:40' '2023-12-08T14:10:11']
headlines = ['Microsoft-OpenAI Partnership Draws Scrutiny From U.K. Regulator', 'US FTC examining Microsoft investment in OpenAI - Bloomberg News', "Microsoft, OpenAI Partnership Draws Scrutiny of UK's Competition Watchdog", 'Microsoft may face UK, FTC probes into OpenAI partnership after taking board seat', 'Does Intel Stock Have Its Mojo Back? We Talk to the Bulls.', 'Microsoft, OpenAI Are Facing a Potential Antitrust Probe in UK', 'UK antitrust regulators eyeing Microsoft-OpenAI partnership', "Microsoft's role in OpenAI on EU antitrust regulators' radar"]
# Create the Pandas dataframe
df = pd.DataFrame({'datetime_utc': datetime64,
				   'headlines': headlines}
				   )
# Set the index
df.set_index('datetime_utc', inplace=True) # Create index from column 'datetime_utc'
print(df)
#                                                             headlines
#datetime_utc
#2023-12-08 17:15:00  Microsoft-OpenAI Partnership Draws Scrutiny Fr...
#2023-12-08 16:51:58  US FTC examining Microsoft investment in OpenA...
#...



# Create Pandas series
import numpy as np
import pandas as pd

# Create numpy 1D array
x = np.array([0, 1, 2, 3], np.int64)
y = np.array([1.23, 2.34, 3.45, 4.56], np.float64)

ds = pd.Series(data=y, index=x)
print(ds)
#0    1.23
#1    2.34
#2    3.45
#3    4.56



# Derive a DateTime column from Unix timestamps in ms
import pandas as pd
import numpy as np

df = pd.DataFrame([[1528185612060, -0.0285],
				   [1528185612072, 0.0000],
				   [1528185612075, 0.0285],
				   [1528185612080, -0.0285],
				   [1528185612085, 0.0285]
				   ], 
				columns = ['unix_time[ms]', 'flow[SLM]'])
# 1528185612060 -> Tuesday, June 5, 2018 8:00:12.060 AM UTC
df['datetime_utc'] = pd.to_datetime(df['unix_time[ms]'],unit='ms')  # change unit to 's' if seconds
print(df)


 

Dataframe Index

Checking if a date exists in a datetime index


# Checking if a date exists in a datetime index

import pandas as pd
import numpy as np

ds = pd.Series(pd.date_range("2000-01-01", periods=4, freq="D"))
df = pd.DataFrame([[1.23, 4.45, 10],
				   [3.38, 3.34, 9],
				   [2.34, 2.34, 8],
				   [4.45, 1.23, 7]], 
				columns = ['float_rnd', 'float_lin', 'int'])
df.set_index(ds, inplace=True)
df.index.rename("Timestamp", inplace=True)
print(df)
#            float_rnd  float_lin  int
#Timestamp
#2000-01-01       1.23       4.45   10
#2000-01-02       3.38       3.34    9
#2000-01-03       2.34       2.34    8
#2000-01-04       4.45       1.23    7


date_day = "2000-01-02"

# This is the preferred method; it works whether or not the datetime index has time precision.
# Note that reading a dataframe from a Parquet file with a datetime index yields an index with time precision
# that returns incorrect results for a test using:
#   df.index.isin([date_day]).any()
#   df.index.isin(np.array([pd.Timestamp(date_day)]).astype('datetime64[ns]')).any()
#   if pd.to_datetime(date_day) in df.index: print("True")
if date_day in df.index: print("date_day in df.index")     

Removing Precision From The Datetime Index


# The datetime64 index in the Parquet file may have time precision
# that influences some tests if a value exists in the index.
# Remove that time precision (but only if no gaps exist!). 
# Create a new column 'dte' based on the datetime index
df['dte'] = df.index.values.astype('datetime64[s]') 
# Strip any hourly precision from column 'dte'.   2023-11-20 00:00:00-05:00 -> 2023-11-20
df['dte'] = df['dte'].dt.floor('D')
# Assign the revised column 'dte' as the index
df.set_index('dte', inplace=True)
# Note that the index freq=None if it has gaps
#print(df.index)     # '2023-11-02', ... '2023-12-08'], dtype='datetime64[s]', name='dte', freq=None)

 

Conversions


import numpy as np 
import pandas as pd 
from darts import TimeSeries

# Create numpy 1D array
x = np.array([0, 1, 2, 3], np.int64)
y = np.array([1.23, 2.34, 3.45, 4.56], np.float64)

# Create Pandas dataframe
df = pd.DataFrame({'x': x, 'y':y})
df.set_index('x', inplace=True, drop=True)
print(df)
#      y
#x
#0  1.23
#1  2.34
#2  3.45
#3  4.56

# Create Pandas series
ds = pd.Series(data=y, index=x)
print(ds)
#0    1.23
#1    2.34
#2    3.45
#3    4.56

# Convert Pandas dataframe to numpy 1d arrays
x = df.index.values
y = df['y'].values
print(x, y)
# [0 1 2 3] [1.23 2.34 3.45 4.56]

# Convert Pandas dataframe to numpy 1d arrays
x = df.index.to_numpy()
y = df.iloc[:,0].values     # Get first column in df using .iloc
print(x, y)
# [0 1 2 3] [1.23 2.34 3.45 4.56]

# Convert Pandas dataframe to numpy 1d arrays
x = df.index.to_numpy()
y = df.iloc[:,0].to_numpy()
print(x, y)
# [0 1 2 3] [1.23 2.34 3.45 4.56]

# Convert Pandas series to 1d arrays
x = ds.index.values
y = ds.values
print(x, y)
# [0 1 2 3] [1.23 2.34 3.45 4.56]

# Convert Pandas series to 1d arrays
x = ds.index.to_numpy()
y = ds.to_numpy()
print(x, y)
# [0 1 2 3] [1.23 2.34 3.45 4.56]

# Convert Pandas dataframe to Darts timeseries
ts = TimeSeries.from_series(pd_series=df['y']) 
print(ts)
#
#array([[[1.23]],
#
#       [[2.34]],
#
#       [[3.45]],
#
#       [[4.56]]])

# Convert Pandas dataframe to Darts timeseries
ts = TimeSeries.from_series(pd_series=df.iloc[:,0]) 
print(ts)
#
#array([[[1.23]],
#
#       [[2.34]],
#
#       [[3.45]],
#
#       [[4.56]]])

# Convert Pandas series to Darts timeseries
ts = TimeSeries.from_series(pd_series=ds)
print(ts)
#
#array([[[1.23]],
#
#       [[2.34]],
#
#       [[3.45]],
#
#       [[4.56]]])


# Convert Darts timeseries to Pandas dataframe
del df, ds, x, y
df = ts.pd_dataframe()
print(df)
#time
#0          1.23
#1          2.34
#2          3.45
#3          4.56 

# Convert Darts timeseries to Pandas series
del df
ds = ts.pd_series()
print(ds)
#time
#0          1.23
#1          2.34
#2          3.45
#3          4.56 


# Convert all columns of np.float64 to np.float32 for data efficiency
df[df.select_dtypes(np.float64).columns] = df.select_dtypes(np.float64).astype(np.float32)
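
# A quick self-contained check of the effect (throwaway two-column frame, named
# df_demo here so it doesn't clobber df; dtypes print before and after):
import numpy as np
import pandas as pd
df_demo = pd.DataFrame({'a': [1.23, 2.34], 'b': [4.56, 5.67]})
print(df_demo.dtypes.values)    # [dtype('float64') dtype('float64')]
df_demo[df_demo.select_dtypes(np.float64).columns] = df_demo.select_dtypes(np.float64).astype(np.float32)
print(df_demo.dtypes.values)    # [dtype('float32') dtype('float32')]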

 

Save / Read Parquet File


from pathlib import Path
# pip install pandas
import pandas as pd

# Save the dataframe to a Parquet file
path_file = Path.cwd().parent.joinpath('data/tbd.parquet')
print("Saving dataframe file to: ", path_file.name)
df.to_parquet(path=path_file, compression='gzip')

# Read the Parquet file to dataframe
path_file = Path.cwd().parent.joinpath('data/tbd.parquet')
if not path_file.exists(): raise Exception("File doesn't exist: ", path_file)
print("Reading Parquet file: ", path_file.name)
df = pd.read_parquet(path=path_file)  

 

Iterate


import pandas as pd
import numpy as np
# Create a simple Pandas DataFrame
df = pd.DataFrame([[1,'A',1.23,10],[2,'B',2.34,9],[3,np.nan,float('NaN'),None],[4,'D',4.45,7]], columns = ['idx','cat','float','int'])
df.reset_index(inplace=True)    # Move the existing index into a column and create a fresh integer index


# Change values using a lambda function (note: .apply() is very slow)
#df['float'] = df['float'].apply(lambda x: x * 2)  # Multiply all values by 2
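
# The vectorized equivalent operates on the whole column at once and is much faster:
#df['float'] = df['float'] * 2    # Multiply all values by 2 (vectorized)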


for index, row in df.iterrows():
	print(index, "\t", row['cat'], "\t", row['float'])

#0        A       1.23
#1        B       2.34
#2        nan     nan
#3        D       4.45


# iterate over rows
for i in range(df.shape[0]):
	print(df.iloc[i, 0], "\t", df.iloc[i, 1], "\t", df.iloc[i, 2])

# COLUMNS

for colname in df.columns:
	ds = df[colname]
	print(colname, ds.name)		# prints column name twice


for idx, colname in enumerate(df.columns):
	ds = df.iloc[:,idx]
	print(idx, colname, ds.name)


 


# import necessary libraries
import pandas as pd
import numpy as np

# Create a simple Pandas DataFrame
df = pd.DataFrame([[1,'A',1.23,10],[2,'B',2.34,9],[3,np.nan,float('NaN'),None],[4,'D',4.45,7]], columns = ['idx','cat','float','int'])

print(df.head(), '\n')

# Change specific row values in a column
df.loc[2, 'int'] = 20  # Change the value at row 2 to 20
df.at[3, 'int'] = 25   # Change the value at row 3 to 25

print(df.head(), '\n')

 

Chain
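
A minimal method-chaining sketch: each call returns a new dataframe, so the steps compose top to bottom. It uses the same example frame as above; the derived column 'double' is only for illustration.


import pandas as pd
import numpy as np

df = pd.DataFrame([[1,'A',1.23,10],[2,'B',2.34,9],[3,np.nan,float('NaN'),None],[4,'D',4.45,7]], columns = ['idx','cat','float','int'])

df_out = (df
          .dropna()                                   # drop rows with NaN/None
          .rename(columns={'cat': 'category'})        # rename a column
          .assign(double=lambda d: d['float'] * 2)    # add a derived column
          .sort_values(by='float', ascending=False)   # sort by 'float' descending
          )
print(df_out)
#   idx category  float   int  double
#3    4        D   4.45   7.0    8.90
#1    2        B   2.34   9.0    4.68
#0    1        A   1.23  10.0    2.46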


Pivot


# import necessary libraries
import pandas as pd
import numpy as np

# Create a simple Pandas DataFrame
df = pd.DataFrame([[1,'A',1.23,10],[2,'B',2.34,9],[3,np.nan,float('NaN'),None],[4,'D',4.45,7]], columns = ['idx','cat','float','int'])

print(df.head(), '\n')

# Create a pivot table
pivot_table = df.pivot_table(values='float', index='idx', columns='cat')

print(pivot_table.head(), '\n')
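#cat      A     B     D
#idx
#1     1.23   NaN   NaN
#2      NaN  2.34   NaN
#4      NaN   NaN  4.45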

Append

Append New Column to Dataframe


import pandas as pd
import numpy as np

ds = pd.Series(pd.date_range("2000-01-01", periods=4, freq="D"))
df = pd.DataFrame([[1.23, 4.45, 10],
				   [3.38, 3.34, 9],
				   [2.34, 2.34, 8],
				   [4.45, 1.23, 7]], 
				columns = ['float_rnd', 'float_lin', 'int'])
df.set_index(ds, inplace=True)
del ds
df.index.rename("Timestamp", inplace=True)
#print(df)
#            float_rnd  float_lin  int
#Timestamp
#2000-01-01       1.23       4.45   10
#2000-01-02       3.38       3.34    9
#2000-01-03       2.34       2.34    8
#2000-01-04       4.45       1.23    7

# Add a new column from a numpy array 

# Create a numpy array with new values for a column in df
new_col_vals = np.array([0, 1, 2, 3])

# Add the numpy array to a new column in df named 'new_col'
df = df.assign(new_col=new_col_vals)

print(df)
#            float_rnd  float_lin  int  new_col
#Timestamp
#2000-01-01       1.23       4.45   10        0
#2000-01-02       3.38       3.34    9        1
#2000-01-03       2.34       2.34    8        2
#2000-01-04       4.45       1.23    7        3


# OR add a new column from a Pandas series

# Create a numpy array with new values for a column in df
new_col_vals = np.array([0, 1, 2, 3])

# Create a Pandas series from the numpy array and the index in df
ds = pd.Series(new_col_vals, index=df.index)

# Add the series ds as a new column in df named 'new_col_ds'
df = df.assign(new_col_ds=ds)

print(df)
#            float_rnd  float_lin  int  new_col  new_col_ds
#Timestamp
#2000-01-01       1.23       4.45   10        0           0
#2000-01-02       3.38       3.34    9        1           1
#2000-01-03       2.34       2.34    8        2           2
#2000-01-04       4.45       1.23    7        3           3



Delete


# Drop rows with the value of NaT (datetime) or NaN (numeric)
df.dropna(inplace=True)


# Drop columns with missing data (NaN)
df.dropna(inplace=True, axis='columns')


# Drop specific columns from a DataFrame
df.drop(columns=['col1', 'col2'], inplace=True)

NaN / NaT



# Print (only) the rows with NaN / NaT
print(df[df.isnull().any(axis=1)])


# Show the count of NaN/NaT by column
print(df.isnull().sum())

Remove NaN/NaT & Rebuild Index
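
A minimal sketch, assuming a default integer index: drop the incomplete rows, then rebuild the index so it is contiguous again.


import pandas as pd
import numpy as np

df = pd.DataFrame({'cat': ['A', 'B', np.nan, 'D'], 'float': [1.23, 2.34, np.nan, 4.45]})

# Drop rows containing NaN/NaT, then renumber the integer index 0..n-1
df = df.dropna().reset_index(drop=True)
print(df)
#  cat  float
#0   A   1.23
#1   B   2.34
#2   D   4.45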



Miscellaneous



# Subtract the first row value from every other row within a single column

# Get the value for the first row using iloc[0]
first_row_unix_ms = df['unix_ms'].iloc[0]
print("first_row_unix_ms:", df['unix_ms'].iloc[0])      # 1727271976000
print()

# Subtract the first row value in column 'unix_ms' from every value.
df['unix_ms'] = df['unix_ms'] - first_row_unix_ms

GroupBy

An aggregate function calculates a single statistical value, such as the mean, sum, min, or max, over a set of values. Typically that set is extracted from a dataframe column by a .groupby() applied to the values of another column.


# pip install pandas
import pandas as pd

df = pd.DataFrame([['A',1,1,52],['A',2,1,56],['A',1,2,54],['A',2,2,58],
				   ['B',1,1,45],['B',2,1,60],['B',1,2,40],['B',2,2,55]], 
				   columns = ['Operator','Part','Trial','Value'])
#print(df)
#  Operator  Part  Trial  Value
#0        A     1      1     52
#1        A     2      1     56
#2        A     1      2     54
#3        A     2      2     58
#4        B     1      1     45
#5        B     2      1     60
#6        B     1      2     40
#7        B     2      2     55


df_grp = df.groupby('Operator').agg({'Value': ['sum','min','max']})
print(df_grp)
#         Value
#           sum min max
#Operator
#A          220  52  58
#B          200  40  60


# Group the data by 'Operator' and 'Part' and calculate the sum, min, and max
df_grp = df.groupby(['Operator','Part']).agg({'Value': ['sum','min','max']})
print(df_grp)
#              Value        
#                sum min max
#Operator Part
#A        1      106  52  54
#         2      114  56  58
#B        1       85  40  45
#         2      115  55  60


# Note that df_grp contains a column MultiIndex for 'Value' with 'sum','min','max'.
# Multi-column names are passed as Tuples, like df[('parent', 'child')]
# Below shows how to access the MultiIndex column values and create a new column.
df_grp[('Value','rng')] = (df_grp[('Value','max')] - df_grp[('Value','min')]).abs()
print(df_grp)
#              Value
#                sum min max rng
#Operator Part
#A        1      106  52  54   2
#         2      114  56  58   2
#B        1       85  40  45   5
#         2      115  55  60   5


# You can also access a MultiIndex column as follows:
idx = pd.IndexSlice
df_min = df_grp.loc[:,idx[:,'min']]
print(df_min)
#              Value
#                min
#Operator Part
#A        1       52
#         2       56
#B        1       40
#         2       55
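
# Another option: .xs() selects one value from a column MultiIndex level
# and drops that level in the result:
df_min = df_grp.xs('min', axis=1, level=1)
print(df_min)
#               Value
#Operator Part
#A        1       52
#         2       56
#B        1       40
#         2       55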


Related Links

https://pandas.pydata.org/pandas-docs/version/0.22/api.html#groupby

 

Change

Column Order


# Change a dataframe's column order
import pandas as pd
import numpy as np
df = pd.DataFrame({'cat': ['A', 'B', np.nan, 'D'], 'float': [1.23, 2.34, float('Nan'), 4.45], 'int': [10, 9, None, 7]})
#df.reset_index(inplace=True)    # create integer index
print(df)
#   cat  float   int
#0    A   1.23  10.0
#1    B   2.34   9.0
#2  NaN    NaN   NaN
#3    D   4.45   7.0


# Reorder the columns using the .iloc method
# Note that columns (not counting the index) are numbered beginning with zero.
# The first column, 'cat', is intentionally not moved.
#df = df.iloc[:,[0,2,1]]
#print(df)
#   cat   int  float
#0    A  10.0   1.23
#1    B   9.0   2.34
#2  NaN   NaN    NaN
#3    D   7.0   4.45


# Reorder the columns using the .loc method and a list of the column names
df = df.loc[:,['int','cat','float']]
print(df)
#    int  cat  float
#0  10.0    A   1.23
#1   9.0    B   2.34
#2   NaN  NaN    NaN
#3   7.0    D   4.45


# Note that the column names can be obtained by:
cols = list(df.columns)
print(cols)
# ['int', 'cat', 'float']

Time Series


Convert


import pandas as pd
import numpy as np
df = pd.DataFrame([[1,'A',1.23,10],[2,'B',2.34,9],[3,np.nan,float('NaN'),None],[4,'D',4.45,7]], columns = ['idx','cat','float','int'])

print(df.head(), '\n')
# Display the count of NaN in column 'float'
print("df['float'].isnull().sum(): ", df['float'].isnull().sum(), '\n')

# Replace NaN values in column 'float' with the value 0
df['float'] = df['float'].fillna(0)

# Replace NaN values for column 'float' with the mean, median, or mode
#print("df['float'].mean() = ", df['float'].mean())
#print("df['float'].median() = ", df['float'].median())
#print("df['float'].mode() = ", df['float'].mode())
#df['float'].fillna(df['float'].median(), inplace=True)

# convert to data type integer
df['float'] = df['float'].astype(int)

# Show the data types and values for the dataframe
df.info()    # .info() prints a summary directly (it returns None, so don't wrap it in print)
print(df.head(),'\n')

 


 

Categorical
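
A minimal sketch of the pandas categorical dtype, which stores repeated labels as integer codes:


import pandas as pd

ds = pd.Series(['low', 'high', 'low', 'medium'], dtype='category')
print(ds.cat.categories)    # Index(['high', 'low', 'medium'], dtype='object')
print(ds.cat.codes.values)  # [1 0 1 2]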


Plot
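
A minimal plotting sketch, assuming matplotlib is installed (pandas' .plot() wraps it):


import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({'y': [1.23, 2.34, 3.45, 4.56]},
                  index=pd.date_range("2000-01-01", periods=4, freq="D"))
df.plot(y='y', marker='o')    # line plot against the datetime index
plt.show()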


Arithmetic
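
A minimal sketch: column arithmetic is elementwise and aligned on the index.


import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
df['sum'] = df['a'] + df['b']     # elementwise addition, aligned on the index
df['ratio'] = df['b'] / df['a']
print(df)
#     a     b   sum  ratio
#0  1.0  10.0  11.0   10.0
#1  2.0  20.0  22.0   10.0
#2  3.0  30.0  33.0   10.0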


I/O
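
A minimal CSV round trip (Parquet is covered above; the file name is arbitrary):


import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3.4, 5.6]})
df.to_csv('tbd.csv', index=False)    # write without the integer index
df = pd.read_csv('tbd.csv')
print(df)
#   a    b
#0  1  3.4
#1  2  5.6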


Create


DataFrame Info


df.shape
len(df)
df.head(n)
df.tail(n)
df.describe()
df.info()	# stats
df.mean(), df.median(), df.mode(), df.std()
df.dtypes	# data types
df.col1.unique()
df.col1.nunique()
df.index

Rename

Index


df.index.rename("Timestamp", inplace=True)

Columns


df.rename({"col1":"newcol1"}, axis=1, inplace=True, errors="raise")
df.rename(columns={df.columns[0]: 'ColA'}, inplace=True, errors="raise")	# Rename by column index 0,1,..
df.rename(lambda x: x + "_col", axis=1)
df.add_suffix('_col')
df.add_prefix('col_')

# Create two dataframes with the same date index
import pandas as pd
import numpy as np
ds = pd.Series(pd.date_range("2000-01-01", periods=6, freq="D"))
df1 = pd.DataFrame(np.random.randn(6, 4),
				columns=['A', 'B', 'C', 'D'])
df1.set_index(ds, inplace=True)
df1.index.rename("Date", inplace=True)

df2 = pd.DataFrame(np.random.randn(6, 2),
				columns=['E', 'F'])
df2.set_index(ds, inplace=True)
df2.index.rename("Date", inplace=True)
print(df2)


# How to extract the first column from df2
# (columns are numbered from 0..)
ds = df2.iloc[:,0]


# Add the first column and column 'F' from df2 to df1.
# If the indexes don't match, NaN will be assigned to the added columns.
# Note that .concat() performs an outer join by default.
df1 = pd.concat([df1, df2.iloc[:,0], df2['F']], axis=1)  # The zero in iloc[:,0] refers to the first column
print(df1)

Sort


# Sort dataframe by column 'A' in descending order
df.sort_values(by=['A'], ascending=False, inplace=True)

# Sort dataframe by the first column in descending order
df.sort_values(by=df.columns[0], ascending=False, inplace=True)

Dataframe Subset


import pandas as pd
import numpy as np

df = pd.DataFrame([[1528185612080, -0.0125],
				   [1528185612085, 0.0000],
				   [1528185612090, 0.125],
				   [1528185612095, 0.250],
				   [1528185612100, 0.375]
				   ], 
				columns = ['unix_time[ms]', 'y1'])
# 1528185612060 -> Tuesday, June 5, 2018 8:00:12.060 AM UTC
df['datetime_utc'] = pd.to_datetime(df['unix_time[ms]'],unit='ms')  # change unit to 's' if seconds
df.set_index('datetime_utc', inplace=True)
#print(df)

# Create a subset based on the values in column 'y1'
df_subset = df.query('y1>=0.0 & y1<0.375')     
print(df_subset)
#                         unix_time[ms]     y1
#datetime_utc
#2018-06-05 08:00:12.085  1528185612085  0.000
#2018-06-05 08:00:12.090  1528185612090  0.125
#2018-06-05 08:00:12.095  1528185612095  0.250



# Extract a subset of rows based on the index between 2018-06-05 08:00:12.085 and 2018-06-05 08:00:12.095
print(df['2018-06-05 08:00:12.085':'2018-06-05 08:00:12.095']     )

# Extract a subset of rows based on the index from 2018-06-05 08:00:12.085 and thereafter
print(df['2018-06-05 08:00:12.085':])


# Slice by index row numbers
# df.iloc[startrow:endrow, startcolumn:endcolumn]
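
# For example, rows 1-2 (end row excluded) and the first two columns of the df above:
print(df.iloc[1:3, 0:2])
#                         unix_time[ms]     y1
#datetime_utc
#2018-06-05 08:00:12.085  1528185612085  0.000
#2018-06-05 08:00:12.090  1528185612090  0.125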


# Get a subset of rows based on the numerical index values

import pandas as pd
import numpy as np

df = pd.DataFrame([[1528185612080, -0.0125],
				[1528185612085, 0.0000],
				[1528185612090, 0.125],
				[1528185612095, 0.250],
				[1528185612100, 0.375]
				], 
				columns = ['unix_time[ms]', 'y1'])
# 1528185612060 -> Tuesday, June 5, 2018 8:00:12.060 AM UTC
df['datetime_utc'] = pd.to_datetime(df['unix_time[ms]'],unit='ms')  # change unit to 's' if seconds
df.set_index('unix_time[ms]', inplace=True)
#print(df)
#                   y1            datetime_utc
#unix_time[ms]
#1528185612080 -0.0125 2018-06-05 08:00:12.080
#1528185612085  0.0000 2018-06-05 08:00:12.085
#1528185612090  0.1250 2018-06-05 08:00:12.090
#1528185612095  0.2500 2018-06-05 08:00:12.095
#1528185612100  0.3750 2018-06-05 08:00:12.100

# Get a subset of rows based on the numerical index beginning at 1528185612090
print(df.query('index >= 1528185612090'))
#                  y1            datetime_utc
#unix_time[ms]
#1528185612090  0.125 2018-06-05 08:00:12.090
#1528185612095  0.250 2018-06-05 08:00:12.095
#1528185612100  0.375 2018-06-05 08:00:12.100

Series Boolean Mask


import pandas as pd
import numpy as np

df = pd.DataFrame([[1528185612080, -0.0125],
				   [1528185612085, 0.0000],
				   [1528185612090, 0.125],
				   [1528185612095, -0.0285],
				   [1528185612100, 0.0285]
				   ], 
				columns = ['unix_time[ms]', 'y1'])
# 1528185612060 -> Tuesday, June 5, 2018 8:00:12.060 AM UTC
df['datetime_utc'] = pd.to_datetime(df['unix_time[ms]'],unit='ms')  # change unit to 's' if seconds
df.set_index('datetime_utc', inplace=True)
#df.set_index('unix_time[ms]', inplace=True)

# Create a boolean mask (a Pandas series) based on the values in column 'y1'
# and use it, e.g., to specify markers in a plot (markevery=df_bool_mask).
#df_bool_mask = df['y1'].gt(0.0)     # .gt .ge .lt .le .ne .eq

# OR

df_bool_mask = df.eval('y1>0.0')     # .eval is similar to .query but creates boolean mask
#print(df_bool_mask)
#datetime_utc
#2018-06-05 08:00:12.080    False
#2018-06-05 08:00:12.085    False
#2018-06-05 08:00:12.090     True
#2018-06-05 08:00:12.095    False
#2018-06-05 08:00:12.100     True
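
# The mask can also be used to filter the dataframe rows:
print(df[df_bool_mask])
#                         unix_time[ms]      y1
#datetime_utc
#2018-06-05 08:00:12.090  1528185612090  0.1250
#2018-06-05 08:00:12.100  1528185612100  0.0285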

 

Duplicates


import pandas as pd

# Create a dataframe where the third row duplicates the first row's
# datetime index value and its column 'int' value
df = pd.DataFrame([['2004-03-10 18:00:00',-1.23, 10],
				   ['2004-04-10 18:00:00',-3.38, 9],
				   ['2004-03-10 18:00:00',-2.34, 10],
				   ['2004-08-10 18:00:00',-4.45, 7]], 
				columns = ['timestamp','float', 'int'])

#Convert 'timestamp' string to datetime
df["timestamp"] = pd.to_datetime(df["timestamp"])
# Create index from column 'timestamp'
df.set_index('timestamp', inplace=True)

print(df)
#                     float  int
#timestamp
#2004-03-10 18:00:00  -1.23   10
#2004-04-10 18:00:00  -3.38    9
#2004-03-10 18:00:00  -2.34   10
#2004-08-10 18:00:00  -4.45    7

# Find all duplicates and include the source
print(df[df.duplicated(['int'], keep=False)])
#timestamp
#2004-03-10 18:00:00  -1.23   10
#2004-03-10 18:00:00  -2.34   10

# Find all index duplicates and include the source
print(df[df.index.duplicated(keep=False)])
#timestamp
#2004-03-10 18:00:00  -1.23   10
#2004-03-10 18:00:00  -2.34   10

# Find just the index duplicates (ignore the source)
print(df[df.index.duplicated(keep='first')])
#timestamp
#2004-03-10 18:00:00  -2.34   10

# Delete the duplicates, keeping the source (first)
rows_before = df.shape[0]
#df.drop_duplicates(keep='first', inplace=True)     # NO! Compares column values, not the index, so it can delete the wrong rows
df = df[~df.index.duplicated(keep='first')]
print("Deleted " + str(rows_before-df.shape[0]) + " rows, now " + str(df.shape[0]))
#Deleted 1 rows, now 3

print(df)
#timestamp
#2004-03-10 18:00:00  -1.23   10
#2004-04-10 18:00:00  -3.38    9
#2004-08-10 18:00:00  -4.45    7