How to Perform Name Reconciliation Using thefuzz and pandas.
Prerequisites
pip install thefuzz
Data Before Processing
data_left
data_right
user_name | purchase_date | amount |
---|
Alice Smit | 2023-01-01 | 100 |
A Smyth | 2023-01-02 | 200 |
B. Johnson | 2023-01-05 | 150 |
Robert Johnson | 2023-01-06 | 300 |
Charlie Brown | 2023-01-07 | 400 |
Code
from morph import MorphGlobalContext
import morph
import pandas as pd
from thefuzz import process
@morph.func
def data_left(context: MorphGlobalContext):
return pd.DataFrame({
"user_id": [1, 2, 3, 4],
"name": ["Alice Smith", "A. Smyth", "Bob Johnson", "Robert Jr."],
"email": ["alice@example.com", "ann@example.com", "bob@example.com", "rob@example.com"]
})
@morph.func
def data_right(context: MorphGlobalContext):
return pd.DataFrame({
"user_name": ["Alice Smit", "A Smyth", "B. Johnson", "Robert Johnson", "Charlie Brown"],
"purchase_date": ["2023-01-01", "2023-01-02", "2023-01-05", "2023-01-06", "2023-01-07"],
"amount": [100, 200, 150, 300, 400]
})
@morph.func
@morph.load_data("data_left")
@morph.load_data("data_right")
def name_reconciliation_example(context: MorphGlobalContext):
df_left = context.data["data_left"]
df_right = context.data["data_right"]
# ------------------------------------------------------
# 1. Create a Column with Fuzzy-Matched Names
# ------------------------------------------------------
# We'll iterate over 'df_left["name"]' and find the best fuzzy match within 'df_right["user_name"]'.
# The best match is accepted only if the score is >= 80.
unique_right_names = df_right["user_name"].unique()
cutoff_score = 80
def fuzzy_match(name):
best_match, best_score = process.extractOne(name, unique_right_names)
return best_match if best_score >= cutoff_score else None
df_left["matched_name"] = df_left["name"].apply(fuzzy_match)
# ------------------------------------------------------
# 2. Merge the Two DataFrames on the Matched Name
# ------------------------------------------------------
df_merged = pd.merge(
df_left,
df_right,
left_on="matched_name",
right_on="user_name",
how="left"
)
# ------------------------------------------------------
# 3. Analyze Unmatched Records
# ------------------------------------------------------
unmatched = df_merged[df_merged["matched_name"].isnull()]
if not unmatched.empty:
print("=== Unmatched Records ===")
print(unmatched, "\n")
return df_merged
Data After Processing