import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

import os
print(os.getcwd())

d:\DATA_ANALYSIS_PROJECTS\GIT_PROJECTS\Automobile_Data_Analysis

df = pd.read_csv("D:\\DATA_ANALYSIS_PROJECTS\\GIT_PROJECTS\\Automobile_Data_Analysis\\Automobile_data.csv")

# Display the first 5 rows
print(df.head())

# Display column names and data types
print(df.info())

   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          3                 ?  alfa-romero       gas        std          two   
1          3                 ?  alfa-romero       gas        std          two   
2          1                 ?  alfa-romero       gas        std          two   
3          2               164         audi       gas        std         four   
4          2               164         audi       gas        std         four   

    body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
0  convertible          rwd           front        88.6  ...          130   
1  convertible          rwd           front        88.6  ...          130   
2    hatchback          rwd           front        94.5  ...          152   
3        sedan          fwd           front        99.8  ...          109   
4        sedan          4wd           front        99.4  ...          136   

   fuel-system  bore  stroke compression-ratio horsepower  peak-rpm city-mpg  \
0         mpfi  3.47    2.68               9.0        111      5000       21   
1         mpfi  3.47    2.68               9.0        111      5000       21   
2         mpfi  2.68    3.47               9.0        154      5000       19   
3         mpfi  3.19     3.4              10.0        102      5500       24   
4         mpfi  3.19     3.4               8.0        115      5500       18   

  highway-mpg  price  
0          27  13495  
1          27  16500  
2          26  16500  
3          30  13950  
4          22  17450  

[5 rows x 26 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non-null    int64  
 17  fuel-system        205 non-null    object 
 18  bore               205 non-null    object 
 19  stroke             205 non-null    object 
 20  compression-ratio  205 non-null    float64
 21  horsepower         205 non-null    object 
 22  peak-rpm           205 non-null    object 
 23  city-mpg           205 non-null    int64  
 24  highway-mpg        205 non-null    int64  
 25  price              205 non-null    object 
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB
None

def clean_automobile_data(df):
    # Replace '?' with NaN
    df = df.replace('?', pd.NA)
    
    # Convert all columns to numeric where possible
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='ignore')
    
    # Select only numeric columns
    numeric_df = df.select_dtypes(include='number')
    
    # Drop rows with missing numeric values
    numeric_df = numeric_df.dropna()
    
    return numeric_df

# Use the function and print the result
numeric_df = clean_automobile_data(df)
print(numeric_df.head())

    symboling  normalized-losses  wheel-base  length  width  height  \
3           2              164.0        99.8   176.6   66.2    54.3   
4           2              164.0        99.4   176.6   66.4    54.3   
6           1              158.0       105.8   192.7   71.4    55.7   
8           1              158.0       105.8   192.7   71.4    55.9   
10          2              192.0       101.2   176.8   64.8    54.3   

    curb-weight  engine-size  bore  stroke  compression-ratio  horsepower  \
3          2337          109  3.19     3.4               10.0       102.0   
4          2824          136  3.19     3.4                8.0       115.0   
6          2844          136  3.19     3.4                8.5       110.0   
8          3086          131  3.13     3.4                8.3       140.0   
10         2395          108  3.50     2.8                8.8       101.0   

    peak-rpm  city-mpg  highway-mpg    price  
3     5500.0        24           30  13950.0  
4     5500.0        18           22  17450.0  
6     5500.0        19           25  17710.0  
8     5500.0        17           20  23875.0  
10    5800.0        23           29  16430.0

C:\Users\Win10.DESKTOP-S18D2M2\AppData\Local\Temp\ipykernel_16708\265821505.py:7: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead
  df[col] = pd.to_numeric(df[col], errors='ignore')

# Descriptors for Height
height_stats = numeric_df['height'].describe()
print(height_stats)

count    160.000000
mean      53.878750
std        2.276608
min       49.400000
25%       52.000000
50%       54.100000
75%       55.500000
max       59.800000
Name: height, dtype: float64

# Ensure 'height' is numeric and drop NaNs
height_data = pd.to_numeric(df['height'], errors='coerce').dropna()

# Set up the figure with 1 row and 3 columns
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# --- Distribution Plot ---
sns.histplot(height_data, kde=True, bins=30, ax=axes[0], color='skyblue', edgecolor='black')
axes[0].set_title('Distribution Plot of Height')
axes[0].set_xlabel('Height')
axes[0].set_ylabel('Frequency')

# --- Violin Plot ---
sns.violinplot(x=height_data, ax=axes[1], color='lightcoral')
axes[1].set_title('Violin Plot of Height')
axes[1].set_xlabel('Height')

# --- Box Plot ---
sns.boxplot(x=height_data, ax=axes[2], color='lightgreen')
axes[2].set_title('Box Plot of Height')
axes[2].set_xlabel('Height')

# Improve layout
plt.tight_layout()
plt.show()

# Descriptors for Height
height_stats = numeric_df['horsepower'].describe()
print(height_stats)

count    160.000000
mean      95.875000
std       30.625708
min       48.000000
25%       69.000000
50%       88.000000
75%      114.000000
max      200.000000
Name: horsepower, dtype: float64

# Ensure 'horsepower' is numeric and drop NaNs
horsepower_data = pd.to_numeric(df['horsepower'], errors='coerce').dropna()

# Set up the figure with 1 row and 3 columns
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# --- Distribution Plot ---
sns.histplot(horsepower_data, kde=True, bins=30, ax=axes[0], color='skyblue', edgecolor='black')
axes[0].set_title('Distribution Plot of Horsepower')
axes[0].set_xlabel('Horsepower')
axes[0].set_ylabel('Frequency')

# --- Violin Plot ---
sns.violinplot(x=horsepower_data, ax=axes[1], color='lightcoral')
axes[1].set_title('Violin Plot of Horsepower')
axes[1].set_xlabel('Horsepower')

# --- Box Plot ---
sns.boxplot(x=horsepower_data, ax=axes[2], color='lightgreen')
axes[2].set_title('Box Plot of Horsepower')
axes[2].set_xlabel('Horsepower')

# Improve layout
plt.tight_layout()
plt.show()

# Descriptors for Height
price_stats = numeric_df['price'].describe()
print(price_stats)

count      160.000000
mean     11427.681250
std       5863.789011
min       5118.000000
25%       7383.500000
50%       9164.000000
75%      14559.250000
max      35056.000000
Name: price, dtype: float64

# Ensure 'price' is numeric and drop NaNs
price_data = pd.to_numeric(df['price'], errors='coerce').dropna()

# Set up the figure with 1 row and 3 columns
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# --- Distribution Plot ---
sns.histplot(price_data, kde=True, bins=30, ax=axes[0], color='skyblue', edgecolor='black')
axes[0].set_title('Distribution Plot of Price')
axes[0].set_xlabel('Price')
axes[0].set_ylabel('Frequency')

# --- Violin Plot ---
sns.violinplot(x=price_data, ax=axes[1], color='lightcoral')
axes[1].set_title('Violin Plot of Price')
axes[1].set_xlabel('Price')

# --- Box Plot ---
sns.boxplot(x=price_data, ax=axes[2], color='lightgreen')
axes[2].set_title('Box Plot of Price')
axes[2].set_xlabel('Price')

# Improve layout
plt.tight_layout()
plt.show()

# Ensure 'price' and 'horsepower' are numeric and clean the data
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')
scatter_df = df[['horsepower', 'price']].dropna()

# Create the scatter plot with trend line
plt.figure(figsize=(6, 4))
sns.regplot(data=scatter_df, x='horsepower', y='price', scatter_kws={'alpha':0.6}, line_kws={"color":"red"})

# Format the Y-axis with 'K' labels
plt.gca().yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x/1000)}K'))

# Customize the plot
plt.title('Price vs Horsepower with Trend Line')
plt.xlabel('Horsepower')
plt.ylabel('Price')
plt.grid(True)
plt.tight_layout()

# Show the plot
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Convert columns to appropriate types
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['drive-wheels'] = df['drive-wheels'].astype('category')

# Filter and drop missing values
box_df = df[['drive-wheels', 'price']].dropna()

# Define order
order = ['fwd', '4wd', 'rwd']

# Format the X-axis with 'K' labels
plt.gca().xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x/1000)}K'))

# Create the box plot
sns.boxplot(data=box_df, y='drive-wheels', x='price', order=order)

plt.show()

# Convert to numeric where needed
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['engine-size'] = pd.to_numeric(df['engine-size'], errors='coerce')
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Filter relevant columns and drop missing values
pair_df = df[['price', 'drive-wheels', 'num-of-cylinders', 'engine-size', 'horsepower']].dropna()

# Optional: encode categorical columns as ordered categories
# For num-of-cylinders, map text to numbers (e.g., 'four' → 4)
cylinder_map = {
    'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'eight': 8, 'twelve': 12
}
pair_df['num-of-cylinders'] = pair_df['num-of-cylinders'].map(cylinder_map)

# Create pair plot
sns.pairplot(pair_df, hue='drive-wheels')
plt.show()

# Compute the correlation matrix
correlation_matrix = numeric_df.corr()

# Show correlation matrix
print(correlation_matrix)

                   symboling  normalized-losses  wheel-base    length  \
symboling           1.000000           0.518388   -0.520465 -0.336217   
normalized-losses   0.518388           1.000000   -0.064001  0.029114   
wheel-base         -0.520465          -0.064001    1.000000  0.871968   
length             -0.336217           0.029114    0.871968  1.000000   
width              -0.219850           0.104857    0.815935  0.839184   
height             -0.473994          -0.417081    0.558764  0.505156   
curb-weight        -0.252372           0.122860    0.810507  0.870355   
engine-size        -0.110238           0.203841    0.650488  0.726666   
bore               -0.257013          -0.036167    0.580484  0.649059   
stroke             -0.020539           0.065627    0.164012  0.116049   
compression-ratio  -0.139022          -0.129971    0.293968  0.188968   
horsepower         -0.003669           0.290906    0.514507  0.666726   
peak-rpm            0.199798           0.240676   -0.292491 -0.239104   
city-mpg            0.088912          -0.236934   -0.576635 -0.716877   
highway-mpg         0.149309          -0.189691   -0.608270 -0.717831   
price              -0.163329           0.199924    0.734789  0.760323   

                      width    height  curb-weight  engine-size      bore  \
symboling         -0.219850 -0.473994    -0.252372    -0.110238 -0.257013   
normalized-losses  0.104857 -0.417081     0.122860     0.203841 -0.036167   
wheel-base         0.815935  0.558764     0.810507     0.650488  0.580484   
length             0.839184  0.505156     0.870355     0.726666  0.649059   
width              1.000000  0.298403     0.870649     0.780018  0.575048   
height             0.298403  1.000000     0.369363     0.116505  0.261501   
curb-weight        0.870649  0.369363     1.000000     0.888847  0.646640   
engine-size        0.780018  0.116505     0.888847     1.000000  0.597336   
bore               0.575048  0.261501     0.646640     0.597336  1.000000   
stroke             0.192891 -0.095364     0.171691     0.296693 -0.105464   
compression-ratio  0.261530  0.237432     0.226513     0.143568  0.019216   
horsepower         0.678779  0.032264     0.788509     0.809855  0.557107   
peak-rpm          -0.235906 -0.251236    -0.262086    -0.287260 -0.315841   
city-mpg          -0.662123 -0.194559    -0.759538    -0.695890 -0.585618   
highway-mpg       -0.689367 -0.221646    -0.787167    -0.711364 -0.586729   
price              0.843316  0.247500     0.893810     0.841725  0.534891   

                     stroke  compression-ratio  horsepower  peak-rpm  \
symboling         -0.020539          -0.139022   -0.003669  0.199798   
normalized-losses  0.065627          -0.129971    0.290906  0.240676   
wheel-base         0.164012           0.293968    0.514507 -0.292491   
length             0.116049           0.188968    0.666726 -0.239104   
width              0.192891           0.261530    0.678779 -0.235906   
height            -0.095364           0.237432    0.032264 -0.251236   
curb-weight        0.171691           0.226513    0.788509 -0.262086   
engine-size        0.296693           0.143568    0.809855 -0.287260   
bore              -0.105464           0.019216    0.557107 -0.315841   
stroke             1.000000           0.240895    0.149315 -0.008569   
compression-ratio  0.240895           1.000000   -0.162894 -0.418726   
horsepower         0.149315          -0.162894    1.000000  0.074932   
peak-rpm          -0.008569          -0.418726    0.074932  1.000000   
city-mpg          -0.021381           0.279513   -0.837180 -0.054938   
highway-mpg       -0.013974           0.222442   -0.827973 -0.034372   
price              0.158798           0.210948    0.758582 -0.173970   

                   city-mpg  highway-mpg     price  
symboling          0.088912     0.149309 -0.163329  
normalized-losses -0.236934    -0.189691  0.199924  
wheel-base        -0.576635    -0.608270  0.734789  
length            -0.716877    -0.717831  0.760323  
width             -0.662123    -0.689367  0.843316  
height            -0.194559    -0.221646  0.247500  
curb-weight       -0.759538    -0.787167  0.893810  
engine-size       -0.695890    -0.711364  0.841725  
bore              -0.585618    -0.586729  0.534891  
stroke            -0.021381    -0.013974  0.158798  
compression-ratio  0.279513     0.222442  0.210948  
horsepower        -0.837180    -0.827973  0.758582  
peak-rpm          -0.054938    -0.034372 -0.173970  
city-mpg           1.000000     0.971997 -0.690100  
highway-mpg        0.971997     1.000000 -0.718314  
price             -0.690100    -0.718314  1.000000

# Set the figure size
plt.figure(figsize=(12, 10))

# Plot heatmap
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5)

# Show plot
plt.title('Correlation Heatmap of Automobile Dataset')
plt.show()

Automobile Data Analysis Report¶

Introduction¶

Scenario¶

Step 1 : ASK¶

Business Task¶

Key Questions¶

Step 2: PREPARE & PROCESS¶

Data Selection¶

Limitations of the Data¶

Environment¶

Import Libraries¶

Load the dataset¶

Preview and Understand Data Structure¶

Problems Identified in the Data¶

Data Cleaning¶

Data Cleaning Pipeline¶

3.1 Univariate Analysis¶

Descriptive Statistics: Height¶

Distribution, Violin, and Box Plots for Height¶

Descriptive Statistics: Horsepower¶

Distribution, Violin, and Box Plots for Horsepower¶

Descriptive Statistics: Price¶

Distribution, Violin, and Box Plots for Power¶

3.2 Bivariate Analysis¶

Scatter Plot with Regression Line: Price vs Horsepower¶

Box Plot: Price by Drive Wheels¶

3.3 Multivariate Analysis¶

Pair Plot (Scatter Matrix) with Categorical Hue¶

Correlation Matrix Calculation¶

Correlation Heatmap¶

Step 4: ACT¶

Key Insights¶

Recommendations¶

Automobile Data Analysis Report¶

Introduction¶

Scenario¶

Step 1 : ASK¶

Business Task¶

Key Questions¶

Step 2: PREPARE & PROCESS¶

Data Selection¶

Limitations of the Data¶

Environment¶

Import Libraries¶

Load the dataset¶

Preview and Understand Data Structure¶

Problems Identified in the Data¶

Data Cleaning¶

Data Cleaning Pipeline¶

Step 3: ANALYSE & SHARE¶

3.1 Univariate Analysis¶

Descriptive Statistics: Height¶

Distribution, Violin, and Box Plots for Height¶

Descriptive Statistics: Horsepower¶

Distribution, Violin, and Box Plots for Horsepower¶

Descriptive Statistics: Price¶

Distribution, Violin, and Box Plots for Power¶

3.2 Bivariate Analysis¶

Scatter Plot with Regression Line: Price vs Horsepower¶

Box Plot: Price by Drive Wheels¶

3.3 Multivariate Analysis¶

Pair Plot (Scatter Matrix) with Categorical Hue¶

Correlation Matrix Calculation¶

Correlation Heatmap¶

Step 4: ACT¶

Key Insights¶

Recommendations¶