import pandas as pd

# Load the dataset. Columns (semicolon-separated): Order; Lot Area; Street;
# Neighborhood; Bldg Type; House Style; Overall Qual; Overall Cond; Year Built;
# Year Remod/Add; 1st Flr over Lot Area; 1st Flr SF; Mo Sold; Yr Sold;
# Sale Type; Sale Condition; SalePrice
data = pd.read_csv("AmesHousing.csv", sep=';')
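# NOTE (an added caveat, not in the original): sep=';' matches the
# semicolon-delimited header listed above; if your copy of AmesHousing.csv is
# the common comma-separated distribution, use pd.read_csv("AmesHousing.csv")
# without the sep argument.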

# View dataset structure
print(data.head())
print(data.info())

# Compute "years since built"
data['Years Since Built'] = data['Yr Sold'] - data['Year Built']

# Compute "years since remod/add"
data['Years Since Remod/Add'] = data['Yr Sold'] - data['Year Remod/Add']

# View the updated dataset
print(data[['Years Since Built', 'Years Since Remod/Add']].head())
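
# Sanity check (an added suggestion, not part of the original script): ages
# should be non-negative; a build or remodel year after the sale year would
# signal a data error worth inspecting.
# print((data[['Years Since Built', 'Years Since Remod/Add']] < 0).sum())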

# Categorize SalePrice into "cheap" and "expensive"
data['Price Category'] = data['SalePrice'].apply(lambda x: 'cheap' if x <= 160000 else 'expensive')

# View the updated dataset
print(data[['SalePrice', 'Price Category']].head())
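
# A data-driven alternative (a sketch; the median split is an assumption, the
# fixed 160000 cutoff above is what the script actually uses):
# cutoff = data['SalePrice'].median()
# data['Price Category'] = (data['SalePrice'] <= cutoff).map({True: 'cheap', False: 'expensive'})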

# Define a threshold for low-frequency values
threshold = 5

# Iterate through each column
for column in data.columns:
    # Only process categorical columns (non-numeric, or treat low-cardinality
    # numeric columns as categorical); customize this condition for your use case
    if data[column].dtype == 'object' or data[column].nunique() < 20:
        # Count the frequency of each value
        frequencies = data[column].value_counts()

        # Identify values with few occurrences
        low_frequency_values = frequencies[frequencies < threshold].index

        # Replace infrequent values with "Other"
        data[column] = data[column].apply(lambda x: 'Other' if x in low_frequency_values else x)

# View the dataframe after reclassification
print(data.head())
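
# Vectorized alternative (a sketch, not from the original) for a given column col:
#     counts = data[col].map(data[col].value_counts())
#     data[col] = data[col].mask(counts < threshold, 'Other')
# Caveat: the nunique() < 20 branch above also matches numeric columns, and
# writing the string 'Other' into one leaves it with a mixed object dtype.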

# Threshold for imbalance percentage (e.g., any class with > 99% of the data)
imbalance_threshold = 0.99

# Identify columns to drop
columns_to_drop = []

# Loop through each column in the DataFrame
for column in data.columns:
    # Only analyze categorical variables
    if data[column].dtype == 'object' or data[column].nunique() < 20:
        # Compute the class distribution as proportions
        class_distribution = data[column].value_counts(normalize=True)

        # Check whether any single class exceeds the imbalance threshold
        if class_distribution.max() > imbalance_threshold:
            print(f"Extreme imbalance found in '{column}' (dropping column)")
            columns_to_drop.append(column)

# You might want to drop other irrelevant variables explicitly
# and add them to columns_to_drop if not needed.
# Example: columns_to_drop.append('Unnamed_column')

# Drop the identified columns
data = data.drop(columns=columns_to_drop)

# Output the cleaned dataset
print(f"Columns dropped: {columns_to_drop}")
print(data.head())
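
# Optional sanity check (not in the original): confirm the remaining shape.
# print(data.shape)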