"""Demonstrate three ways of de-duplicating an orders table with pandas.

The script builds a small ``orders`` frame that contains one exact duplicate
row (order 1002) and one order id that appears twice with different statuses
(order 1004), then prints the result of:

* dropping rows that are identical in every column,
* keeping only the last row seen for each ``order_id``,
* dropping every ``order_id`` that occurs more than once.

``drop_duplicates`` returns a new frame each time; ``orders`` itself is never
reassigned or modified in place.
"""

import pandas as pd

orders = pd.DataFrame(
    {
        "order_id": [1001, 1002, 1002, 1003, 1004, 1004, 1005],
        "customer": ["Ada", "Lin", "Lin", "Maya", "Omar", "Omar", "Nia"],
        "region": ["EMEA", "APAC", "APAC", "AMER", "EMEA", "EMEA", "APAC"],
        "status": ["paid", "paid", "paid", "open", "paid", "refunded", "open"],
        "total_usd": [150.0, 240.0, 240.0, 875.0, 95.0, 95.0, 360.0],
    }
)

print(f"pandas {pd.__version__}")
print()

print("BASE")
print(orders.to_string(index=True))
print()

# No subset: a row is a duplicate only if every column matches an earlier
# row. The original index labels are kept, so the gap left by the removed
# row stays visible in the printout.
exact = orders.drop_duplicates()
print("DROP_EXACT_ROWS")
print(exact.to_string(index=True))
print()

# One row per order_id, taking the last occurrence in row order.
# ignore_index=True relabels the result 0..n-1.
latest_per_order = orders.drop_duplicates(
    subset=["order_id"],
    keep="last",
    ignore_index=True,
)
print("KEEP_LAST_BY_ORDER")
print(latest_per_order.to_string(index=False))
print()

# keep=False discards *every* row whose order_id is repeated, leaving only
# the ids that appear exactly once.
unique_order_only = orders.drop_duplicates(
    subset=["order_id"],
    keep=False,
    ignore_index=True,
)
print("DROP_ALL_DUPLICATE_ORDER_IDS")
print(unique_order_only.to_string(index=False))
print()

print("VERIFY")
print(f"original rows: {len(orders)}")
print(f"exact rows: {len(exact)}")
print(f"latest order_ids: {latest_per_order['order_id'].tolist()}")
print(f"latest row index labels: {latest_per_order.index.tolist()}")
print(f"unique order_ids only: {unique_order_only['order_id'].tolist()}")
print(f"source rows unchanged: {len(orders)}")