import pandas as pd
import matplotlib.pyplot as plt
all_data = pd.read_excel("./Dataset/SuperStoreData.xlsx", sheet_name="Orders")
all_data.head()
Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country/Region | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2018-152156 | 2018-11-08 | 2018-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
1 | 2 | CA-2018-152156 | 2018-11-08 | 2018-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
2 | 3 | CA-2018-138688 | 2018-06-12 | 2018-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
3 | 4 | US-2017-108966 | 2017-10-11 | 2017-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
4 | 5 | US-2017-108966 | 2017-10-11 | 2017-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 21 columns
all_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 9983 entries, 0 to 9993 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 9983 non-null int64 1 Order ID 9983 non-null object 2 Order Date 9983 non-null datetime64[ns] 3 Ship Date 9983 non-null datetime64[ns] 4 Ship Mode 9983 non-null object 5 Customer ID 9983 non-null object 6 Customer Name 9983 non-null object 7 Segment 9983 non-null object 8 Country/Region 9983 non-null object 9 City 9983 non-null object 10 State 9983 non-null object 11 Postal Code 9983 non-null float64 12 Region 9983 non-null object 13 Product ID 9983 non-null object 14 Category 9983 non-null object 15 Sub-Category 9983 non-null object 16 Product Name 9983 non-null object 17 Sales 9983 non-null float64 18 Quantity 9983 non-null int64 19 Discount 9983 non-null float64 20 Profit 9983 non-null float64 21 Year 9983 non-null object dtypes: datetime64[ns](2), float64(4), int64(2), object(14) memory usage: 1.8+ MB
all_data = all_data.dropna(how="any")
all_data['Category'] = all_data['Category'].apply(lambda x: str(x))
all_data['Year'] = all_data['Order Date'].apply(lambda x: str(x)[0:4])
sales_by_category = all_data[['Category', 'Sales']].groupby(['Category']).sum()
sales_by_category.head()
Sales | |
---|---|
Category | |
Furniture | 736879.6953 |
Office Supplies | 716837.5220 |
Technology | 834554.2730 |
sales_by_region = all_data[['Region', 'Sales']].groupby(['Region']).sum()
sales_by_region.head()
Sales | |
---|---|
Region | |
Central | 501239.8908 |
East | 669851.8700 |
South | 391721.9050 |
West | 725457.8245 |
sales_by_subcategory = all_data[['Sub-Category', 'Sales']].groupby(['Sub-Category']).sum()
sales_by_subcategory.head()
Sales | |
---|---|
Sub-Category | |
Accessories | 167075.3080 |
Appliances | 106989.2210 |
Art | 27110.7520 |
Binders | 203412.7330 |
Bookcases | 110475.0963 |
sales_by_year = all_data[['Year', 'Sales', 'Profit']].groupby('Year').sum()
sales_by_year.head()
Sales | Profit | |
---|---|---|
Year | ||
2016 | 484247.4981 | 49543.9741 |
2017 | 465412.4090 | 60426.6767 |
2018 | 606238.5380 | 81006.0989 |
2019 | 732373.0452 | 93175.2937 |
categories = all_data["Category"].unique()
plt.bar(categories, sales_by_category["Sales"], color ='lightblue')
plt.xlabel("Categories")
plt.ylabel("Sales")
plt.title("Sales By Category")
plt.show()
regions = all_data.sort_values('Region', ascending = True)["Region"].unique()
regions
plt.bar(regions, sales_by_region["Sales"], color ='lightblue')
plt.xlabel("Region")
plt.ylabel("Sales")
plt.title("Sales By Region")
plt.show()`
subcategories = all_data.sort_values('Sub-Category', ascending = True)["Sub-Category"].unique()
plt.bar(subcategories, sales_by_subcategory["Sales"], color ='lightblue')
plt.xlabel("Sub Category")
plt.ylabel("Sales")
plt.xticks(rotation='vertical')
plt.title("Sales By Sub Category")
plt.show()
years = all_data.sort_values('Year', ascending = True)["Year"].unique()
plt.bar(years, sales_by_year["Sales"], color ='lightblue')
plt.xlabel("Year")
plt.ylabel("Sales")
plt.title("Sales By Year")
plt.show()
years = all_data.sort_values('Year', ascending = True)["Year"].unique()
plt.plot(years, sales_by_year["Sales"], color ='lightblue')
plt.plot(years, sales_by_year["Profit"])
plt.xlabel("Year")
plt.ylabel("Values")
plt.title("Yearly profit and sales")
plt.legend(["Sales", "Profit"])
plt.show()