Compare commits

4 Commits: 4cae4e9e28...master
| Author | SHA1 | Date |
|---|---|---|
| | 8f50cee48b | |
| | 42b3b2891c | |
| | c3a717ccbe | |
| | 0cd4403cc2 | |
.idea/libraries/openjfx_javafx_base.xml (generated, 2 changes)
```diff
@@ -3,7 +3,7 @@
     <properties maven-id="org.openjfx:javafx-base:20-ea+2" />
     <CLASSES>
       <root url="jar://$MAVEN_REPOSITORY$/org/openjfx/javafx-base/20-ea+2/javafx-base-20-ea+2.jar!/" />
-      <root url="jar://$MAVEN_REPOSITORY$/org/openjfx/javafx-base/20-ea+2/javafx-base-20-ea+2-linux.jar!/" />
+      <root url="jar://$MAVEN_REPOSITORY$/org/openjfx/javafx-base/20-ea+2/javafx-base-20-ea+2-win-x86-monocle.jar!/" />
     </CLASSES>
     <JAVADOC />
     <SOURCES />
```
WS24_25/PyCharm/pythonProject/.idea/misc.xml (generated, 2 changes)
```diff
@@ -3,5 +3,5 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.12 (pythonProject)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (pythonProject)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13" project-jdk-type="Python SDK" />
 </project>
```
```diff
@@ -5,7 +5,7 @@
       <excludeFolder url="file://$MODULE_DIR$/.venv" />
       <excludeFolder url="file://$MODULE_DIR$/venv" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.11 (pythonProject)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.13" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
```
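The two hunks above move the project interpreter from Python 3.11 (pythonProject) to Python 3.13 in the IDE configuration. A minimal runtime check like the following (not part of the commit) can confirm which interpreter a script actually executes under:

```python
import sys

# Report the interpreter in use; after the config change above this is
# expected to show 3.13 rather than 3.11 (an expectation, not verified here).
print(sys.version)
print(sys.executable)
```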
WS24_25/PyCharm/pythonProject/P11/AmesHousing.csv (normal file, 2931 lines)

File diff suppressed because it is too large.
WS24_25/PyCharm/pythonProject/P11/excercise.py (normal file, 70 lines)
@@ -0,0 +1,70 @@
```python
import pandas as pd

# Load the dataset: Order;Lot Area;Street;Neighborhood;Bldg Type;House Style;Overall Qual;Overall Cond;Year Built;Year Remod/Add;1st Flr over Lot Area;1st Flr SF;Mo Sold;Yr Sold;Sale Type;Sale Condition;SalePrice
data = pd.read_csv("AmesHousing.csv", sep=';')

# View dataset structure
print(data.head())
print(data.info())

# Compute "years since built"
data['Years Since Built'] = data['Yr Sold'] - data['Year Built']

# Compute "years since remod/add"
data['Years Since Remod/Add'] = data['Yr Sold'] - data['Year Remod/Add']

# View the updated dataset
print(data[['Years Since Built', 'Years Since Remod/Add']].head())

# Categorize SalePrice into "cheap" and "expensive"
data['Price Category'] = data['SalePrice'].apply(lambda x: 'cheap' if x <= 160000 else 'expensive')

# View the updated dataset
print(data[['SalePrice', 'Price Category']].head())

# Define a threshold for low-frequency values
threshold = 5

# Iterate through each column
for column in data.columns:
    # Only process categorical columns (non-numeric, or treat numeric as categorical if needed)
    if data[column].dtype == 'object' or data[column].nunique() < 20:  # Customize this condition for your use case
        # Count frequency of each value
        frequencies = data[column].value_counts()

        # Identify categories with few occurrences
        low_frequency_values = frequencies[frequencies < threshold].index

        # Replace infrequent values with "Other"
        data[column] = data[column].apply(lambda x: 'Other' if x in low_frequency_values else x)

# View the dataframe after reclassification
print(data.head())

# Threshold for imbalance percentage (e.g., any class with >99% of the data)
imbalance_threshold = 0.99

# Identify columns to drop
columns_to_drop = []

# Loop through each column in the DataFrame
for column in data.columns:
    # Only analyze categorical variables
    if data[column].dtype == 'object' or data[column].nunique() < 20:
        # Compute class distribution
        class_distribution = data[column].value_counts(normalize=True)

        # Check if any single class exceeds the imbalance threshold
        if class_distribution.max() > imbalance_threshold:
            print(f"Extreme imbalance found in '{column}' (Dropping column)")
            columns_to_drop.append(column)

# You might want to drop other irrelevant variables explicitly
# Add them to columns_to_drop if not needed
# Example: columns_to_drop.append('Unnamed_column')

# Drop the identified columns
data = data.drop(columns=columns_to_drop)

# Output the cleaned dataset
print(f"Columns dropped: {columns_to_drop}")
print(data.head())
```
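The rare-category step in excercise.py replaces values element-wise with `apply`; a vectorized equivalent using `Series.isin` and `Series.where` is sketched below. The helper name `bucket_rare_values` and the toy data are illustrative assumptions, not part of the commit.

```python
import pandas as pd

def bucket_rare_values(s: pd.Series, min_count: int = 5) -> pd.Series:
    """Replace values that occur fewer than min_count times with 'Other'."""
    counts = s.value_counts()
    rare = counts[counts < min_count].index
    # Keep values that are not rare; replace the rest with 'Other'.
    return s.where(~s.isin(rare), other='Other')

# Toy frame, invented for illustration only.
toy = pd.DataFrame({'Neighborhood': ['NAmes'] * 6 + ['Gilbert'] * 2})
print(bucket_rare_values(toy['Neighborhood'], min_count=5))
```

Applied column by column, this gives the same result as the loop in the script while avoiding a Python-level lambda call per row.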
```diff
@@ -1,4 +1,4 @@
-Order;Lot Area;Street;Neighborhood;Bldg Type;House Style;Overall Qual;Overall Cond;Year Built;Year Remod/Add;1st Flr over Lot Area;1st Flr SF;Mo Sold;Yr Sold;Sale Type;Sale Condition;SalePrice
+Order;Lot Area;Street;Neighborhood;Bldg Type;House Style;Overall Qual;Overall Cond;Year Built;Year Remod/Add;1st Flr over Lot Area;1st Flr SF;Mo Sold;Yr_Sold;Sale Type;Sale Condition;SalePrice
 1;31770;Pave;NAmes;1Fam;1Story;6;5;1960;1960;0,05;1656;5;2010;WD ;Normal;215000
 2;11622;Pave;NAmes;1Fam;1Story;5;6;1961;1961;0,08;896;6;2010;WD ;Normal;105000
 3;14267;Pave;NAmes;1Fam;1Story;6;6;1958;1958;0,09;1329;6;2010;WD ;Normal;172000
```
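The header hunk above renames the `Yr Sold` column to `Yr_Sold`. Code such as excercise.py indexes `data['Yr Sold']`, so reading a file with the new header would raise a KeyError. One defensive option, sketched under the assumption that this CSV is consumed by that script (the file name below is a placeholder), is to normalize the header after loading:

```python
import pandas as pd

# Placeholder path for the semicolon-separated CSV carrying the renamed header.
data = pd.read_csv("housing_subset.csv", sep=';')

# Map the renamed column back to the name excercise.py expects; this is a
# no-op when the old spelling 'Yr Sold' is already present.
data = data.rename(columns={'Yr_Sold': 'Yr Sold'})
data['Years Since Built'] = data['Yr Sold'] - data['Year Built']
```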
Submodule WS24_25/SWTD/BuchRedesign deleted from 5781acef2b
Submodule WS24_25/SWTD/buchv1 deleted from 3353f56cb0