Import modules¶

In [2]:
import pandas as pd
In [3]:
import matplotlib.pyplot as plt

Read File Data¶

In [4]:
all_data = pd.read_excel("./Dataset/SuperStoreData.xlsx", sheet_name="Orders")
all_data.head()
Out[4]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country/Region City ... Postal Code Region Product ID Category Sub-Category Product Name Sales Quantity Discount Profit
0 1 CA-2018-152156 2018-11-08 2018-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson ... 42420.0 South FUR-BO-10001798 Furniture Bookcases Bush Somerset Collection Bookcase 261.9600 2 0.00 41.9136
1 2 CA-2018-152156 2018-11-08 2018-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson ... 42420.0 South FUR-CH-10000454 Furniture Chairs Hon Deluxe Fabric Upholstered Stacking Chairs,... 731.9400 3 0.00 219.5820
2 3 CA-2018-138688 2018-06-12 2018-06-16 Second Class DV-13045 Darrin Van Huff Corporate United States Los Angeles ... 90036.0 West OFF-LA-10000240 Office Supplies Labels Self-Adhesive Address Labels for Typewriters b... 14.6200 2 0.00 6.8714
3 4 US-2017-108966 2017-10-11 2017-10-18 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... 33311.0 South FUR-TA-10000577 Furniture Tables Bretford CR4500 Series Slim Rectangular Table 957.5775 5 0.45 -383.0310
4 5 US-2017-108966 2017-10-11 2017-10-18 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... 33311.0 South OFF-ST-10000760 Office Supplies Storage Eldon Fold 'N Roll Cart System 22.3680 2 0.20 2.5164

5 rows × 21 columns

Information about data¶

In [9]:
all_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9983 entries, 0 to 9993
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Row ID          9983 non-null   int64         
 1   Order ID        9983 non-null   object        
 2   Order Date      9983 non-null   datetime64[ns]
 3   Ship Date       9983 non-null   datetime64[ns]
 4   Ship Mode       9983 non-null   object        
 5   Customer ID     9983 non-null   object        
 6   Customer Name   9983 non-null   object        
 7   Segment         9983 non-null   object        
 8   Country/Region  9983 non-null   object        
 9   City            9983 non-null   object        
 10  State           9983 non-null   object        
 11  Postal Code     9983 non-null   float64       
 12  Region          9983 non-null   object        
 13  Product ID      9983 non-null   object        
 14  Category        9983 non-null   object        
 15  Sub-Category    9983 non-null   object        
 16  Product Name    9983 non-null   object        
 17  Sales           9983 non-null   float64       
 18  Quantity        9983 non-null   int64         
 19  Discount        9983 non-null   float64       
 20  Profit          9983 non-null   float64       
 21  Year            9983 non-null   object        
dtypes: datetime64[ns](2), float64(4), int64(2), object(14)
memory usage: 1.8+ MB

Cleaning and modification of data¶

In [5]:
all_data = all_data.dropna(how="any")
In [7]:
all_data['Category'] = all_data['Category'].apply(lambda x: str(x))
In [8]:
all_data['Year'] = all_data['Order Date'].apply(lambda x: str(x)[0:4])

Analysis of data¶

In [10]:
sales_by_category = all_data[['Category', 'Sales']].groupby(['Category']).sum()
sales_by_category.head()
Out[10]:
Sales
Category
Furniture 736879.6953
Office Supplies 716837.5220
Technology 834554.2730
In [11]:
sales_by_region = all_data[['Region', 'Sales']].groupby(['Region']).sum()
sales_by_region.head()
Out[11]:
Sales
Region
Central 501239.8908
East 669851.8700
South 391721.9050
West 725457.8245
In [12]:
sales_by_subcategory = all_data[['Sub-Category', 'Sales']].groupby(['Sub-Category']).sum()
sales_by_subcategory.head()
Out[12]:
Sales
Sub-Category
Accessories 167075.3080
Appliances 106989.2210
Art 27110.7520
Binders 203412.7330
Bookcases 110475.0963
In [21]:
sales_by_year = all_data[['Year', 'Sales', 'Profit']].groupby('Year').sum()
sales_by_year.head()
Out[21]:
Sales Profit
Year
2016 484247.4981 49543.9741
2017 465412.4090 60426.6767
2018 606238.5380 81006.0989
2019 732373.0452 93175.2937

Ploting graph¶

Sales By Category¶

In [12]:
categories = all_data["Category"].unique()
plt.bar(categories, sales_by_category["Sales"], color ='lightblue')
plt.xlabel("Categories")
plt.ylabel("Sales")
plt.title("Sales By Category")
plt.show()

Sales by region¶

In [29]:
regions = all_data.sort_values('Region', ascending = True)["Region"].unique()
regions
plt.bar(regions, sales_by_region["Sales"], color ='lightblue')
plt.xlabel("Region")
plt.ylabel("Sales")
plt.title("Sales By Region")
plt.show()`

Sales by sub category¶

In [27]:
subcategories = all_data.sort_values('Sub-Category', ascending = True)["Sub-Category"].unique()
plt.bar(subcategories, sales_by_subcategory["Sales"], color ='lightblue')
plt.xlabel("Sub Category")
plt.ylabel("Sales")
plt.xticks(rotation='vertical')
plt.title("Sales By Sub Category")
plt.show()

Sales by year¶

In [23]:
years = all_data.sort_values('Year', ascending = True)["Year"].unique()
plt.bar(years, sales_by_year["Sales"], color ='lightblue')
plt.xlabel("Year")
plt.ylabel("Sales")
plt.title("Sales By Year")
plt.show()
In [25]:
years = all_data.sort_values('Year', ascending = True)["Year"].unique()
plt.plot(years, sales_by_year["Sales"], color ='lightblue')
plt.plot(years, sales_by_year["Profit"])
plt.xlabel("Year")
plt.ylabel("Values")
plt.title("Yearly profit and sales")
plt.legend(["Sales", "Profit"])
plt.show()