Currently, I am trying to create a Pydantic model for a pandas DataFrame. I would like to check whether a column's values are unique with the following:
import pandas as pd
from typing import List
from pydantic import BaseModel


class CustomerRecord(BaseModel):
    """A single customer row taken from the dataframe."""

    id: int
    name: str
    address: str


class CustomerRecordDF(BaseModel):
    """Custom-root model wrapping the full list of customer records."""

    __root__: List[CustomerRecord]


df = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "name": ["Bob", "Joe", "Justin"],
        "address": ["123 Fake St", "125 Fake St", "123 Fake St"],
    }
)
# One dict per row, e.g. {"id": 1, "name": "Bob", "address": "123 Fake St"}.
df_dict = df.to_dict(orient="records")
CustomerRecordDF.parse_obj(df_dict)
I would now like to run a validation here and have it fail, since the address column is not unique.
The following returns what I need
from pydantic import root_validator


class CustomerRecordDF(BaseModel):
    """List-of-records model that rejects duplicate addresses."""

    __root__: List[CustomerRecord]

    @root_validator(pre=True)
    def unique_values(cls, values):
        # pre=True: runs before field parsing, so each record is still a
        # plain dict as produced by DataFrame.to_dict(orient='records').
        records = values.get('__root__')
        seen = set()
        for record in records:
            print(record['address'])
            if record['address'] in seen:
                raise ValueError('Duplicate Address')
            seen.add(record['address'])
        return values


CustomerRecordDF.parse_obj(df_dict)
# Output:
# ValidationError: 1 validation error for CustomerRecordDF
# __root__
#   Duplicate Address (type=value_error)
but I want to be able to reuse this validator for other dataframes I create, and also to apply this unique check to multiple columns — not just address.
Ideally something like the following
from pydantic import root_validator


class CustomerRecordDF(BaseModel):
    __root__: List[CustomerRecord]

    # Desired API: a reusable factory that builds one uniqueness
    # validator per column name (root_unique_validator is the function
    # the answer below defines).
    _validate_unique_name = root_unique_validator('name')
    _validate_unique_address = root_unique_validator('address')
Advertisement
Answer
You could use an inner function and the allow_reuse
argument:
def root_unique_validator(field):
    # Factory: builds a pydantic root validator bound to `field`.
    def validator(cls, values):
        # Use the field arg to validate a specific field
        ...
    # allow_reuse=True lets the same underlying function back several
    # validators (pydantic v1 otherwise raises on duplicate validators).
    return root_validator(pre=True, allow_reuse=True)(validator)
Full example:
import pandas as pd
from typing import List
from pydantic import BaseModel, root_validator


class CustomerRecord(BaseModel):
    """A single customer row taken from the dataframe."""

    id: int
    name: str
    address: str


def root_unique_validator(field):
    """Build a root validator that rejects duplicate values in *field*."""

    def validator(cls, values):
        # pre=True below means each record is still a plain dict here.
        seen = set()
        for record in values.get("__root__"):
            if record[field] in seen:
                raise ValueError(f"Duplicate {field}")
            seen.add(record[field])
        return values

    # allow_reuse=True lets this factory be called several times on the
    # same (or different) models without pydantic complaining about
    # duplicate validator functions.
    return root_validator(pre=True, allow_reuse=True)(validator)


class CustomerRecordDF(BaseModel):
    """List of customer records; name and address must each be unique."""

    __root__: List[CustomerRecord]

    _validate_unique_name = root_unique_validator("name")
    _validate_unique_address = root_unique_validator("address")


df = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "name": ["Bob", "Joe", "Justin"],
        "address": ["123 Fake St", "125 Fake St", "123 Fake St"],
    }
)
df_dict = df.to_dict(orient="records")
CustomerRecordDF.parse_obj(df_dict)
# Output:
# pydantic.error_wrappers.ValidationError: 1 validation error for CustomerRecordDF
# __root__
#   Duplicate address (type=value_error)
And if you use a duplicated name:
# Here goes the most part of the full example above df = pd.DataFrame( { "id": [1, 2, 3], "name": ["Bob", "Joe", "Bob"], "address": ["123 Fake St", "125 Fake St", "127 Fake St"], } ) df_dict = df.to_dict(orient="records") CustomerRecordDF.parse_obj(df_dict) # Output: # pydantic.error_wrappers.ValidationError: 1 validation error for CustomerRecordDF # __root__ # Duplicate name (type=value_error)
You could also have `root_unique_validator` receive more than one field and build a single root validator that checks all of them at once. That would probably make the `allow_reuse` argument unnecessary, since each model would then declare only one validator.