2025-02-14 07:58:45 +01:00
parent 0cd4403cc2
commit c3a717ccbe
135 changed files with 3914 additions and 4 deletions

@@ -0,0 +1,70 @@
import pandas as pd
# Load the dataset (semicolon-separated). Expected columns:
# Order; Lot Area; Street; Neighborhood; Bldg Type; House Style; Overall Qual;
# Overall Cond; Year Built; Year Remod/Add; 1st Flr over Lot Area; 1st Flr SF;
# Mo Sold; Yr Sold; Sale Type; Sale Condition; SalePrice
data = pd.read_csv("AmesHousing.csv", sep=';')
# View dataset structure
print(data.head())
print(data.info())
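# Optional sketch: a quick look at missing values before feature engineering
# (this assumes AmesHousing.csv marks absent entries as NaN on load)
print(data.isna().sum().sort_values(ascending=False).head(10))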
# Compute "years since built"
data['Years Since Built'] = data['Yr Sold'] - data['Year Built']
# Compute "years since remod/add"
data['Years Since Remod/Add'] = data['Yr Sold'] - data['Year Remod/Add']
# View the updated dataset
print(data[['Years Since Built', 'Years Since Remod/Add']].head())
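# Optional sanity check (a sketch): a build or remodel year recorded after the
# sale year would make these age features negative, which usually signals a
# data-entry issue worth inspecting rather than a real house
negative_ages = data[(data['Years Since Built'] < 0) | (data['Years Since Remod/Add'] < 0)]
print(f"Rows with negative age features: {len(negative_ages)}")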
# Categorize SalePrice into "cheap" and "expensive"
data['Price Category'] = data['SalePrice'].apply(lambda x: 'cheap' if x <= 160000 else 'expensive')
# View the updated dataset
print(data[['SalePrice', 'Price Category']].head())
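# Alternative sketch: derive the cutoff from the data instead of hard-coding
# 160000 (which is close to the median sale price in the Ames dataset). The
# column name 'Price Category (median)' is illustrative, not from the original.
# pd.cut uses right-inclusive bins, so x <= median maps to 'cheap', matching
# the x <= 160000 logic above.
median_price = data['SalePrice'].median()
data['Price Category (median)'] = pd.cut(
    data['SalePrice'],
    bins=[-float('inf'), median_price, float('inf')],
    labels=['cheap', 'expensive'],
)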
# Define a threshold for low-frequency values
threshold = 5
# Iterate through each column
for column in data.columns:
    # Only process categorical columns (non-numeric, or low-cardinality numeric
    # columns treated as categorical); customize this condition for your use case
    if data[column].dtype == 'object' or data[column].nunique() < 20:
        # Count the frequency of each value
        frequencies = data[column].value_counts()
        # Identify categories with fewer occurrences than the threshold
        low_frequency_values = frequencies[frequencies < threshold].index
        # Replace infrequent values with "Other"
        data[column] = data[column].apply(lambda x: 'Other' if x in low_frequency_values else x)
# View the dataframe after reclassification
print(data.head())
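# Equivalent vectorized sketch of the loop body above, shown for a single
# column ('Neighborhood' is one of the dataset's columns): Series.map looks up
# each row's value frequency, and Series.where keeps values that meet the
# threshold while replacing the rest with 'Other'
counts = data['Neighborhood'].value_counts()
keep = data['Neighborhood'].map(counts) >= threshold
data['Neighborhood'] = data['Neighborhood'].where(keep, 'Other')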
# Threshold for imbalance percentage (e.g., any class with >99% of the data)
imbalance_threshold = 0.99
# Identify columns to drop
columns_to_drop = []
# Loop through each column in the DataFrame
for column in data.columns:
    # Only analyze categorical (or low-cardinality) variables, as above
    if data[column].dtype == 'object' or data[column].nunique() < 20:
        # Compute the class distribution as proportions
        class_distribution = data[column].value_counts(normalize=True)
        # Check whether any single class exceeds the imbalance threshold
        if class_distribution.max() > imbalance_threshold:
            print(f"Extreme imbalance found in '{column}' (dropping column)")
            columns_to_drop.append(column)
# You might want to drop other irrelevant variables explicitly
# Add them to columns_to_drop if not needed
# Example: columns_to_drop.append('Unnamed_column')
# Drop the identified columns
data = data.drop(columns=columns_to_drop)
# Output the cleaned dataset
print(f"Columns dropped: {columns_to_drop}")
print(data.head())
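# Optional final step (sketch): persist the cleaned dataset for later modeling
# steps; the output filename is illustrative, not from the original commit
data.to_csv("AmesHousing_clean.csv", sep=';', index=False)
print(f"Cleaned shape: {data.shape}")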