import pandas as pd

# Load the semicolon-delimited Ames housing dataset. Expected columns include:
# Order, Lot Area, Street, Neighborhood, Bldg Type, House Style, Overall Qual,
# Overall Cond, Year Built, Year Remod/Add, 1st Flr SF, Mo Sold, Yr Sold,
# Sale Type, Sale Condition, SalePrice.
data = pd.read_csv("AmesHousing.csv", sep=';')

# Inspect structure. NOTE: DataFrame.info() prints directly and returns None,
# so it must not be wrapped in print() (the original emitted a spurious "None").
print(data.head())
data.info()

# Derive age-at-sale features from the year columns.
data['Years Since Built'] = data['Yr Sold'] - data['Year Built']
data['Years Since Remod/Add'] = data['Yr Sold'] - data['Year Remod/Add']

# View the new feature columns.
print(data[['Years Since Built', 'Years Since Remod/Add']].head())

# Binarize SalePrice: at or below 160000 -> 'cheap', otherwise 'expensive'.
data['Price Category'] = data['SalePrice'].apply(
    lambda price: 'cheap' if price <= 160000 else 'expensive'
)
print(data[['SalePrice', 'Price Category']].head())

# Collapse rare category values into a single 'Other' bucket.
threshold = 5  # a value occurring fewer than this many times is "rare"

for column in data.columns:
    # Heuristic: treat object columns — and numeric columns with few distinct
    # values — as categorical. Tune the nunique() cutoff for your use case.
    # NOTE(review): for numeric columns this injects the string 'Other',
    # silently changing the column dtype to object — confirm that is intended.
    if data[column].dtype == 'object' or data[column].nunique() < 20:
        frequencies = data[column].value_counts()
        low_frequency_values = frequencies[frequencies < threshold].index
        # Vectorized replacement (isin + mask) instead of a per-row apply();
        # same result, one C-level pass per column.
        data[column] = data[column].mask(
            data[column].isin(low_frequency_values), 'Other'
        )

# View the dataframe after rare-value reclassification.
print(data.head())

# Drop categorical columns dominated by a single class (> 99% of rows) —
# such columns carry almost no signal.
imbalance_threshold = 0.99
columns_to_drop = []

for column in data.columns:
    # Same "categorical" heuristic as above.
    if data[column].dtype == 'object' or data[column].nunique() < 20:
        class_distribution = data[column].value_counts(normalize=True)
        if class_distribution.max() > imbalance_threshold:
            print(f"Extreme imbalance found in '{column}' (Dropping column)")
            columns_to_drop.append(column)

# You might want to drop other irrelevant variables explicitly —
# add them to columns_to_drop if not needed, e.g.:
# columns_to_drop.append('Unnamed_column')

data = data.drop(columns=columns_to_drop)

# Output the cleaned dataset.
print(f"Columns dropped: {columns_to_drop}")
print(data.head())