Data Days for Good 2023, MassMutual
"Analyzing CS literacy in MA, district- and county-wise, addressing financial or social inequities"
- Filling Missing Values
- Required Binning of at least 1 CS Class Participation into 3 Bins
- Creation of Others Dataframe (Asian + White)
- Joining of Dataframes (CS Classes Participation + AP Test Scores)
- Pandas Profiling
- Dual Box Plots Creation on Basis of CS Course Offered or CS Course Not-Offered:
!ls
import pandas as pd

# header=1 tells pandas to take the column names from the second row of each sheet.
HEADER_ROW = 1

# --- CS course participation workbooks, one per group ---
df1 = pd.read_excel("datasets/AfricAmerican-Black/artcourse-afrc-amer-cs.xlsx", header=HEADER_ROW)
df3 = pd.read_excel("datasets/Hispanic-Latino/artcourse-hisplat-cs.xlsx", header=HEADER_ROW)
dfw = pd.read_excel("datasets/Others-(Asian + White)/artcourse-white.xlsx", header=HEADER_ROW)
dfa = pd.read_excel("datasets/Others-(Asian + White)/artcourse-asian.xlsx", header=HEADER_ROW)

# --- AP performance workbooks, one per group ---
df1_ap = pd.read_excel("datasets/AfricAmerican-Black/ap_performance_afr_amerc.xlsx", header=HEADER_ROW)
df3_ap = pd.read_excel("datasets/Hispanic-Latino/ap_performance_hisp_latino.xlsx", header=HEADER_ROW)
dfw_ap = pd.read_excel("datasets/Others-(Asian + White)/ap_performance_white.xlsx", header=HEADER_ROW)
dfa_ap = pd.read_excel("datasets/Others-(Asian + White)/ap_performance_asian.xlsx", header=HEADER_ROW)

# Quick visual check of the first rows (only rendered when run as notebook cells).
df1.head(5)
df3.head(5)
df1_ap.head(5)
df3_ap.head(5)
# Print column dtypes and row counts for the four Asian/White frames.
schema_targets = [("dfw_ap", dfw_ap), ("dfa_ap", dfa_ap), ("dfw", dfw), ("dfa", dfa)]
for position, (frame_label, frame) in enumerate(schema_targets):
    # Every heading except the very first is preceded by a blank line.
    lead = "" if position == 0 else "\n"
    print(f"{lead}Data types of {frame_label}:")
    print(frame.dtypes)
    print(f"\nLength of {frame_label}:", len(frame))
# Columns that are read through the .str accessor (i.e. treated as text), per frame.
# Thousands separators and surrounding whitespace are removed before casting to float.
# NOTE(review): the column lists differ per frame — presumably only these columns
# came in as text; confirm against the workbooks.
text_count_columns = [
    # dfw & dfw_ap
    (dfw, ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', 'All Grades', 'Total Students']),
    (dfw_ap, ['Tests Taken', 'Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5']),
    # dfa & dfa_ap
    (dfa, ['05', '06', '07', '08', 'All Grades', 'Total Students']),
    (dfa_ap, ['Tests Taken', 'Score=4', 'Score=5']),
]
for frame, columns in text_count_columns:
    for column in columns:
        without_commas = frame[column].str.replace(',', '').str.strip()
        frame[column] = without_commas.astype(float)
# Strip thousands separators and whitespace from the text-typed count columns of the
# African-American/Black and Hispanic/Latino frames, then cast them to float.
primary_group_text_columns = [
    (df1, ['All Grades', 'Total Students']),
    (df1_ap, ['Tests Taken']),
    (df3_ap, ['Tests Taken']),
    (df3, ['All Grades', 'Total Students', '09']),
]
for frame, text_columns in primary_group_text_columns:
    for text_column in text_columns:
        # Remove commas and surrounding whitespace from this column ...
        stripped = frame[text_column].str.replace(',', '').str.strip()
        # ... and store it back as float.
        frame[text_column] = stripped.astype(float)
# Print the dtypes of all eight frames (four per print call, space-separated).
print(df1.dtypes, df3.dtypes, df1_ap.dtypes, df3_ap.dtypes)
print(dfw.dtypes, dfw_ap.dtypes, dfa.dtypes, dfa_ap.dtypes)
# Print the per-column null counts of the two course-participation frames.
print(df1.isnull().sum(), # column wise null check
df3.isnull().sum()) # column wise null check
# The remaining lines are bare expressions: they only render when they are the
# last statement of a notebook cell. The trailing comma below builds a 1-tuple.
df1_ap.isnull().sum(), # column wise null check
df3_ap.isnull().sum()#
df3.dtypes
import pandas as pd
import matplotlib.pyplot as plt

# How many districts to show in each ranking.
top_n = 40

# (frame, name of the derived share column, group wording used in the plot title)
share_specs = [
    (df1, 'AfriAmerican-Black-%', 'African-American'),
    (df3, 'Hispanic-Latino-%', 'Hispanic-Latino'),
]
for frame, share_column, group_wording in share_specs:
    # Share = 'All Grades' as a percentage of 'Total Students', per district row.
    frame[share_column] = frame['All Grades'] * 100 / frame['Total Students']

    # Rank districts by that share and keep the top ones.
    sorted_df = frame.sort_values(by=share_column, ascending=False)
    top_districts = sorted_df.head(top_n)

    # Plot the top districts; the tall figure leaves room for every district label.
    plt.figure(figsize=(10, 12))
    plt.barh(top_districts['District Name'], top_districts[share_column])
    plt.xlabel(share_column)
    plt.ylabel('District Name')
    plt.title(f'Top {top_n} Districts with Highest {group_wording} Percentage')
    plt.show()
# Replace every missing value with 0, in place, in all eight source frames.
for frame_to_fill in (df1_ap, df3_ap, df1, df3, dfa, dfw, dfa_ap, dfw_ap):
    frame_to_fill.fillna(0, inplace=True)

# Total number of cells equal to 0 in each of the two AP frames.
num_zeros1 = df1_ap.eq(0).sum().sum()
num_zeros3 = df3_ap.eq(0).sum().sum()
num_zeros1, num_zeros3
data_frames = [df1_ap, df3_ap, df1, df3, dfw, dfw_ap, dfa, dfa_ap]
# Summary statistics for the first four frames only.
# NOTE(review): the list holds eight frames but only four are described —
# confirm whether the Asian/White frames were meant to be included.
for described_frame in data_frames[:4]:
    print(described_frame.describe())
    print("-------------------")
df1.dtypes
df1_ap.dtypes
df3.dtypes
# Grade columns that make up each school-level band.
selected_columns1 = ['K', '01', '02', '03']
selected_columns2 = ['04', '05', '06', '07', '08']
selected_columns3 = ['09', '10', '11', '12']
grade_bands = {
    'Primary': selected_columns1,
    'Secondary': selected_columns2,
    'High': selected_columns3,
}

# For every band, add a column holding the row-wise sum of its grade columns.
for band_name, band_grades in grade_bands.items():
    for course_frame in (df1, df3, dfa, dfw):
        course_frame[band_name] = course_frame[band_grades].sum(axis=1)

df1['Primary'].describe()
df3.isnull().sum() # column wise null check
# The per-grade columns are no longer needed once the band sums exist.
columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dataframes = {'dfa': dfa, 'dfw': dfw, 'df1': df1, 'df3': df3}

for course_frame in dataframes.values():
    course_frame.drop(columns=columns_to_drop, inplace=True)

# Show what is left in each frame.
for name, df in dataframes.items():
    print(f"Data types of DataFrame {name}:")
    print(df.dtypes)
    print()
dfw.dtypes
import pandas as pd
# Outer-join the White and Asian course frames on district name and code, so a
# district that appears in only one of them is still kept (its other side is NaN).
merged_df_o = dfw.merge(dfa, how='outer', on=['District Name', 'District Code'])
merged_df_o.dtypes
# rename_columns = {
# 'Sum_K-3_x': 'Sum_K-3',
# 'Sum_04-08_x': 'Sum_04-08',
# 'Sum_09-12_x': 'Sum_09-12',
# 'Sum_K-3_y': 'Sum_K-3',
# 'Sum_04-08_y': 'Sum_04-08',
# 'Sum_09-12_y': 'Sum_09-12',
# 'Total Students_x': 'Total Students',
# 'All Grades_x': 'All Grades',
# 'Total Students_y': 'Total Students',
# 'All Grades_y': 'All Grades',
# }
# merged_df_o = merged_df_o.rename(columns=rename_columns)
merged_df_o
merged_df_o.dtypes
# Collapse the White (_x) and Asian (_y) columns of the outer join into single
# "Others" columns.
#
# Series.add(..., fill_value=0) is used instead of `+`: after an outer merge a
# district present in only one source frame has NaN on the other side, and
# `value + NaN` is NaN. The fillna(0) further down would then turn that
# district's real counts into 0.
summed_columns = ['All Grades', 'Total Students', 'Primary', 'Secondary', 'High']
for summed_column in summed_columns:
    merged_df_o[summed_column] = merged_df_o[f'{summed_column}_x'].add(
        merged_df_o[f'{summed_column}_y'], fill_value=0
    )

# The suffixed source columns are redundant once the sums exist.
suffixed_columns = [f'{name}{suffix}' for name in summed_columns for suffix in ('_x', '_y')]
merged_df_o.drop(suffixed_columns, axis=1, inplace=True)
merged_df_o

# Share of 'All Grades' in 'Total Students', as a percentage, per district row.
merged_df_o['Others-%'] = merged_df_o['All Grades'] * 100 / merged_df_o['Total Students']
merged_df_o.dtypes

# Any remaining NaN (e.g. 0/0 in the share above) becomes 0.
merged_df_o.fillna(0, inplace=True)
!ls
# Persist the combined Asian + White course frame, without the index column.
merged_df_o.to_excel('merged_df_others_cs-class.xlsx', index=False)

# Per-column count of missing values still present in the frame.
nan_values = merged_df_o.isna().sum()
print(nan_values)
len(dfw_ap)
len(dfa_ap)
dfw_ap.dtypes
dfa_ap.dtypes
# Outer-join the White and Asian AP frames so districts present in only one of
# them are kept.
merged_df_o_ap = pd.merge(dfw_ap, dfa_ap, on=['District Name', 'District Code'], how='outer')
merged_df_o_ap.dtypes

# Add the White (_x) and Asian (_y) counts column by column.
# fill_value=0 treats a side missing from the outer join as 0 instead of
# letting NaN wipe out the other side's count. The original cell also repeated
# these six sums twice; once is enough.
ap_count_columns = ['Tests Taken', 'Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5']
for ap_column in ap_count_columns:
    merged_df_o_ap[ap_column] = merged_df_o_ap[f'{ap_column}_x'].add(
        merged_df_o_ap[f'{ap_column}_y'], fill_value=0
    )

# Drop every suffixed source column, including the per-group rate columns,
# which are recomputed below from the combined counts.
suffixed_ap_columns = [
    f'{name}{suffix}'
    for name in ap_count_columns + ['% Score 1-2', '% Score 3-5']
    for suffix in ('_x', '_y')
]
merged_df_o_ap.drop(suffixed_ap_columns, axis=1, inplace=True)
merged_df_o_ap.dtypes

# Recompute the score-band rates from the combined counts.
# NOTE(review): these are fractions (0-1), not multiplied by 100 — confirm this
# matches the scale of the '% Score' columns in the other groups' AP frames.
low_scores = merged_df_o_ap['Score=1'] + merged_df_o_ap['Score=2']
high_scores = merged_df_o_ap['Score=3'] + merged_df_o_ap['Score=4'] + merged_df_o_ap['Score=5']
merged_df_o_ap['% Score 1-2'] = low_scores / merged_df_o_ap['Tests Taken']
merged_df_o_ap['% Score 3-5'] = high_scores / merged_df_o_ap['Tests Taken']
merged_df_o_ap.dtypes

# NaN left by 0/0 (no tests taken) becomes 0.
merged_df_o_ap.fillna(0, inplace=True)

# The individual score counts are not needed once the rates exist.
merged_df_o_ap.drop(['Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5'], axis=1, inplace=True)
merged_df_o_ap.dtypes
!ls
# Persist the combined Asian + White AP frame, without the index column.
merged_df_o_ap.to_excel('merged_df_ap_performance_others.xlsx', index=False)
len(merged_df_o_ap)
len(merged_df_o)

# Report how many distinct district codes each remaining source frame holds.
data_frames = {'df1': df1, 'df1_ap': df1_ap, 'df3': df3, 'df3_ap': df3_ap}
for df_name, df in data_frames.items():
    unique_values = df['District Code'].nunique()
    print(f"Number of unique values in 'District Code' for {df_name} : {unique_values}")
df1_ap.dtypes
# df3_ap.fillna(0, inplace=True)
# df1.isnull().sum() # column wise null check
df3_ap.isnull().sum()
df3_ap.dtypes
# Remove the individual score-count columns ('Score=1' .. 'Score=5') from both AP frames.
columns_to_drop = [f'Score={score}' for score in range(1, 6)]
for ap_frame in (df1_ap, df3_ap):
    ap_frame.drop(columns=columns_to_drop, inplace=True)
df1_ap.dtypes
df3_ap.dtypes
len(merged_df_o_ap)
len(merged_df_o)
# Inner join of course participation with AP performance for the Asian + White
# group: only districts present in both frames survive.
merged_df_others_o_ap = pd.merge(merged_df_o, merged_df_o_ap, how='inner', on='District Code')
len(df1)
len(df1_ap)
len(df3)
len(df3_ap)

# Join 1: inner — same join for the African-American/Black and Hispanic/Latino groups.
merged_df1_i = pd.merge(df1, df1_ap, how='inner', on='District Code')
merged_df3_i = pd.merge(df3, df3_ap, how='inner', on='District Code')
merged_df3_i.head(5)
merged_df1_i.head(5)
merged_df_o_ap.head(5)
merged_df_o.head(5)
# Left joins: keep every district of the course-participation frame, with NaN
# AP columns where a district has no AP record.
merged_df1_l = df1.merge(df1_ap, on='District Code', how='left')
merged_df3_l = df3.merge(df3_ap, on='District Code', how='left')
# This frame was previously built with how='inner', which contradicted its
# `_left` name and the two sibling joins above.
merged_df_others_o_ap_left = merged_df_o.merge(merged_df_o_ap, on='District Code', how='left')
len(merged_df_others_o_ap_left)
len(merged_df1_l)
len(merged_df3_l)
merged_df_others_o_ap_left
# Replace missing values with 0 in the joined frames, in place.
for joined_frame in (merged_df_others_o_ap, merged_df1_l, merged_df3_l):
    joined_frame.fillna(0, inplace=True)

# Null counts and previews (only rendered when run as notebook cells).
merged_df1_l.isnull().sum()
merged_df3_l.isnull().sum()
merged_df_others_o_ap.isnull().sum()
merged_df3_l.head(5)
merged_df_others_o_ap.head(5)
merged_df1_l.head(5)
merged_df3_l.head(5)
!ls
merged_df1_i.dtypes
!ls
# Write each inner-joined frame to its own workbook, without the index column.
joined_exports = {
    'merged_df_Afric-American.xlsx': merged_df1_i,
    'merged_df_Hispanic-Latino.xlsx': merged_df3_i,
    'merged_df_Others-(Asian+White).xlsx': merged_df_others_o_ap,
}
for workbook_name, joined_frame in joined_exports.items():
    joined_frame.to_excel(workbook_name, index=False)
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): `merged_df2` and its 'Sum_*' columns are not created anywhere in
# the visible part of this notebook — confirm where they come from before running.
band_columns = ['Sum_K-3', 'Sum_04-08', 'Sum_09-12']

# --- Plot 1: every band total against 'Tests Taken', full range ---
scatter_data = merged_df2[band_columns + ['Tests Taken']]
for band_column in band_columns:
    plt.scatter(scatter_data['Tests Taken'], scatter_data[band_column], label=band_column)
plt.xlabel('Tests Taken')
plt.ylabel('Sum Values')
plt.legend()
plt.show()

# --- Plot 2: same data, semi-transparent markers, x axis limited to 0-200, with grid ---
scatter_data = merged_df2[band_columns + ['Tests Taken']]
for band_column in band_columns:
    plt.scatter(scatter_data['Tests Taken'], scatter_data[band_column], label=band_column, alpha=0.5)
plt.xlim(0, 200)
plt.xlabel('Tests Taken')
plt.ylabel('Sum Values')
plt.legend()
plt.grid(True)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# NOTE(review): `merged_df2` is not created in the visible part of this notebook.
# Three spatial axes plus a colour dimension, all taken from merged_df2.
x = merged_df2['Tests Taken']
y = merged_df2['Sum_K-3']
z = merged_df2['Sum_04-08']
c = merged_df2['Sum_09-12']

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Points are coloured by 'Sum_09-12'.
scatter = ax.scatter(x, y, z, c=c, cmap='viridis')
ax.set(
    xlabel='Tests Taken',
    ylabel='Sum_K-3',
    zlabel='Sum_04-08',
    title='3D Scatter Plot',
)

# Colour scale legend.
cbar = plt.colorbar(scatter)
cbar.set_label('Sum_09-12')

plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): `merged_df2` is not created in the visible part of this notebook.
heatmap_columns = ['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']
heatmap_data = merged_df2[heatmap_columns]

# Pairwise correlation between the selected columns.
correlation_matrix = heatmap_data.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Heatmap of Correlation')
plt.show()
# from pandas_profiling import ProfileReport
# # Assuming you already have the DataFrame 'merged_df2'
# # Generate the pandas profiling report
# profile = ProfileReport(merged_df2, title='Pandas Profiling Report')
# # Display the report as an interactive widget form
# profile.to_widgets()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# NOTE(review): `merged_df2` is not created in the visible part of this notebook.
# Create a 3D scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Extract the required columns
x = merged_df2['Tests Taken']
y = merged_df2['Sum_K-3']
z = merged_df2['Sum_04-08']
c = merged_df2['Sum_09-12']
labels = merged_df2['District Name_x']

# Scatter plot with color-coded points based on 'Sum_09-12'
scatter = ax.scatter(x, y, z, c=c, cmap='viridis')

# Add the district name next to every data point.
# Iterating the series together avoids `x[i]`, which looks values up by index
# *label* and therefore only works while the frame has a default 0..n-1 index.
for x_value, y_value, z_value, label in zip(x, y, z, labels):
    ax.text(x_value, y_value, z_value, label, color='black', fontsize=8, ha='center', va='center')

# Set labels and title
ax.set_xlabel('Tests Taken')
ax.set_ylabel('Sum_K-3')
ax.set_zlabel('Sum_04-08')
ax.set_title('3D Scatter Plot')

# Add a colorbar
cbar = plt.colorbar(scatter)
cbar.set_label('Sum_09-12')

# Show the plot
plt.show()
merged_df1_i.dtypes
merged_df_others_o_ap.dtypes
# column_index_to_drop = 7 # Specify the index of the column you want to drop
# merged_df_others_o_ap.drop(merged_df_others_o_ap.columns[column_index_to_drop], axis=1, inplace=True)
# columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# merged_df1_i.drop(columns=columns_to_drop, inplace=True)
# columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# merged_df3_i.drop(columns=columns_to_drop, inplace=True)
merged_df1_i.dtypes
merged_df3_i.dtypes
merged_df3_i.dtypes
merged_df1_i.dtypes
# merged_df1_i = merged_df1_i.drop(['CS course Binary for Sum_K-3', 'CS course Binary for Sum_04-08', 'CS course Binary for Sum_09-12'], axis=1)
!pip install -U ydata-profiling
!ls
from ydata_profiling import ProfileReport

# Build one full (non-minimal) profiling report per joined group frame.
profile1 = ProfileReport(
    merged_df1_i,
    title="District-wise Profiling Report of African American/Black CS Course Takers",
    minimal=False,
)
profile2 = ProfileReport(
    merged_df3_i,
    title="District-wise Profiling Report of Hispanic/Latino CS Course Takers",
    minimal=False,
)
profile3 = ProfileReport(
    merged_df_others_o_ap,
    title="District-wise Profiling Report of Others(Asian & White) CS Course Takers",
    minimal=False,
)

# Write each report to its own HTML file.
report_outputs = [
    (profile1, "District-wise_African_American_CS_Course_Takers_Profiling_Report.html"),
    (profile2, "District-wise_Hispanic_Latino_CS_Course_Takers_Profiling_Report.html"),
    (profile3, "District-wise_Others_CS_Course_Takers_Profiling_Report.html"),
]
for report, html_path in report_outputs:
    report.to_file(html_path)
!ls
from PIL import Image
import matplotlib.pyplot as plt
# Open an image file and draw it on the current Matplotlib axes with a title.
# NOTE(review): 'image.jpg' and 'Image Title' look like template placeholders,
# and there is no plt.show() here — confirm whether this cell is still needed.
img = Image.open('image.jpg')
plt.imshow(img)
plt.title('Image Title')
Afri-Amer-Black.png Hispanic-Latino.png Others-Asian-White.png
from PIL import Image
import matplotlib.pyplot as plt
# Display the saved PNG for the African American/Black group.
# Presumably a heatmap exported from the profiling report — confirm.
img1 = Image.open('Afri-Amer-Black.png')
plt.imshow(img1)
plt.title('African American/Black CS Course Takers')
plt.show()
Takeaways from the above heatmap (African-American/Black):
- High School has a high correlation with All Grades
merged_df1_i.dtypes
import seaborn as sns
import matplotlib.pyplot as plt

# --- Matplotlib version: 'Secondary' (restricted to 0-100) against '% Score 3-5' ---
filtered_merged_df1_i = merged_df1_i[merged_df1_i['Secondary'].between(0, 100)]
plt.scatter(filtered_merged_df1_i['Secondary'], filtered_merged_df1_i['% Score 3-5'])
plt.xlabel('Secondary')
plt.ylabel('% Score 3-5')
plt.title('Scatter Plot: % Score 3-5 vs. Secondary')
plt.grid(True)
plt.show()

# --- Seaborn version, once for 'Secondary' and once for 'High' ---
for level_column in ('Secondary', 'High'):
    # Keep only rows whose value for this level lies within 0-100 (inclusive).
    filtered_merged_df1_i = merged_df1_i[merged_df1_i[level_column].between(0, 100)]
    sns.scatterplot(x=level_column, y='% Score 3-5', data=filtered_merged_df1_i)
    plt.xlabel(level_column)
    plt.ylabel('% Score 3-5')
    plt.title(f'Scatter Plot: {level_column} vs % Score 3-5')
    plt.grid(True)
    plt.show()
merged_df1_i.dtypes
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only rows whose 'High' value lies within 0-200 (inclusive).
filtered_merged_df1_i = merged_df1_i[merged_df1_i['High'].between(0, 200)]

# 'High' against 'All Grades' for the African-American/Black frame.
scatter_ax = sns.scatterplot(data=filtered_merged_df1_i, x='High', y='All Grades')
scatter_ax.set_xlabel('High')
scatter_ax.set_ylabel('All Grades')
scatter_ax.set_title('Scatter Plot: High vs All Grades')
scatter_ax.grid(True)
plt.show()
The above scatter plot shows a clear correlation between High School and All Grades, indicating that this ethnic group's contribution to All Grades is higher in the later (high school) grades than in the primary or secondary grades.
merged_df1_i.dtypes
import seaborn as sns
import matplotlib.pyplot as plt

# District share of the group against the rate of AP scores 3-5.
scatter_ax = sns.scatterplot(data=merged_df1_i, x='AfriAmerican-Black-%', y='% Score 3-5')
scatter_ax.set_xlabel('AfriAmerican-Black-%')
scatter_ax.set_ylabel('% Score 3-5')
scatter_ax.set_title('Scatter Plot: AfriAmerican-Black-% vs Passing Percentage')
scatter_ax.grid(True)
plt.show()
from PIL import Image
import matplotlib.pyplot as plt
# Display the saved PNG for the Hispanic/Latino group.
# Presumably a heatmap exported from the profiling report — confirm.
img2 = Image.open('Hispanic-Latino.png')
plt.imshow(img2)
plt.title('Hispanic/Latino CS Course Takers')
plt.show()
Takeaways from the above heatmap (Hispanic/Latino):
- High School has a high correlation with All Grades
merged_df3_i.dtypes
import seaborn as sns
import matplotlib.pyplot as plt

# District share of the group against the rate of AP scores 3-5.
scatter_ax = sns.scatterplot(data=merged_df3_i, x='Hispanic-Latino-%', y='% Score 3-5')
scatter_ax.set_xlabel('Hispanic-Latino-%')
scatter_ax.set_ylabel('% Score 3-5')
scatter_ax.set_title('Scatter Plot: Hispanic-Latino-% vs Passing Percentage')
scatter_ax.grid(True)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# One filtered frame per school level: rows whose value for that level is within 0-200.
filtered_merged_df3_i = merged_df3_i[merged_df3_i['High'].between(0, 200)]
filtered_merged_df3_i_primary = merged_df3_i[merged_df3_i['Primary'].between(0, 200)]
filtered_merged_df3_i_secondary = merged_df3_i[merged_df3_i['Secondary'].between(0, 200)]

# NOTE(review): the x-axis labels say "% of Total Students", but these columns
# are row sums of per-grade columns — confirm the intended wording.
level_frames = [
    ('High', filtered_merged_df3_i),
    ('Primary', filtered_merged_df3_i_primary),
    ('Secondary', filtered_merged_df3_i_secondary),
]
for level_name, level_frame in level_frames:
    # Scatter plot: <level> vs All Grades
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=level_name, y='All Grades', data=level_frame)
    plt.xlabel(f'{level_name} School Students (% of Total Students)')
    plt.ylabel('All Grades')
    plt.title(f'Correlation between {level_name} School Students and All Grades')
    plt.grid(True)
    plt.show()
The above 3 scatter plots show a strong correlation between High School and All Grades, and a high overall correlation between Secondary and All Grades.
from PIL import Image
import matplotlib.pyplot as plt
# Display the saved PNG for the Others (Asian & White) group.
# Presumably a heatmap exported from the profiling report — confirm.
img3 = Image.open('Others-Asian-White.png')
plt.imshow(img3)
plt.title('Other/Asian/White CS Course Takers')
plt.show()
Others (Asian & White):
- The percentage of district-wise participation in CS classes is highly proportional to All Grades
- Secondary is highly proportional to All Grades; Primary also contributes to All Grades, but less than Secondary
merged_df_others_o_ap.dtypes
import seaborn as sns
import matplotlib.pyplot as plt

# One filtered frame per school level: rows whose value for that level is within 0-200.
filtered_merged_df_others_o_ap = merged_df_others_o_ap[merged_df_others_o_ap['High'].between(0, 200)]
filtered_merged_df_others_o_ap_primary = merged_df_others_o_ap[merged_df_others_o_ap['Primary'].between(0, 200)]
filtered_merged_df_others_o_ap_secondary = merged_df_others_o_ap[merged_df_others_o_ap['Secondary'].between(0, 200)]

# NOTE(review): the x-axis labels say "% of Total Students", but these columns
# are sums of per-grade columns — confirm the intended wording.
level_frames = [
    ('High', filtered_merged_df_others_o_ap),
    ('Primary', filtered_merged_df_others_o_ap_primary),
    ('Secondary', filtered_merged_df_others_o_ap_secondary),
]
for level_name, level_frame in level_frames:
    # Scatter plot: <level> vs All Grades
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=level_name, y='All Grades', data=level_frame)
    plt.xlabel(f'{level_name} School Students (% of Total Students)')
    plt.ylabel('All Grades')
    plt.title(f'Correlation between {level_name} School Students and All Grades')
    plt.grid(True)
    plt.show()
The above 3 scatter plots show a strong overall correlation of High School, Secondary, and Primary with All Grades.
import seaborn as sns
import matplotlib.pyplot as plt

# District share of the Asian + White group against the rate of AP scores 3-5.
scatter_ax = sns.scatterplot(data=merged_df_others_o_ap, x='Others-%', y='% Score 3-5')
scatter_ax.set_xlabel('Others-%')
scatter_ax.set_ylabel('% Score 3-5')
scatter_ax.set_title('Scatter Plot: Others(Asian&White)-% vs Passing Percentage')
scatter_ax.grid(True)
plt.show()

# Column dtypes of the three joined frames, printed in one call.
print(merged_df_others_o_ap.dtypes, merged_df1_i.dtypes, merged_df3_i.dtypes)
merged_df1_i.dtypes
!ls
import matplotlib.pyplot as plt
import seaborn as sns

# Binary flag per school level: 1 when the level's summed value is above 0, else 0.
for level_name in ('Primary', 'Secondary', 'High'):
    merged_df1_i[f'CS course Binary for {level_name}'] = (merged_df1_i[level_name] > 0).astype('int64')

# Output file per school level (the file names are not uniform, so they are listed explicitly).
box_plot_files = {
    'Primary': 'African-American-Box Plot(Primaryvs3-5%).png',
    'Secondary': 'African-American-Box Plot(Secondary vs 3-5%).png',
    'High': 'African-American-Box Plot(High vs 3-5%).png',
}

for level_name, box_plot_file in box_plot_files.items():
    # Split the districts by the flag for this level.
    level_flag = merged_df1_i[f'CS course Binary for {level_name}']
    data_cs_course_offered = merged_df1_i[level_flag == 1]
    data_cs_course_not_offered = merged_df1_i[level_flag == 0]

    fig, ax = plt.subplots()

    # One box per group, comparing the rate of AP scores 3-5.
    boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
    boxplot_colors = ['blue', 'red']
    boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)

    # Blue for "offered", red for "not offered".
    for box, color in zip(boxplot['boxes'], boxplot_colors):
        box.set(facecolor=color)

    # Draw outliers as round markers.
    for flier in boxplot['fliers']:
        flier.set(marker='o', markersize=5)

    ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')
    ax.set_ylabel('Percentage of Passing Scores')
    plt.title(f'Comparison of {level_name} Schools and % of Scores 3-5 for African American Group')

    # Save before showing.
    plt.savefig(box_plot_file)
    plt.show()
!ls
import os
!ls
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of '% Score 3-5' for the Hispanic/Latino group, split by the
# binary flag derived from the 'Primary' column of merged_df3_i.

# Flag each school level with 1 when its value is positive and 0 otherwise
# (a missing value compares False, so it is flagged 0).
for level in ('Primary', 'Secondary', 'High'):
    merged_df3_i[f'CS course Binary for {level}'] = (merged_df3_i[level] > 0).astype('int64')

# Split the rows on the primary-school flag.
primary_flag = merged_df3_i['CS course Binary for Primary']
data_cs_course_offered = merged_df3_i[primary_flag == 1]
data_cs_course_not_offered = merged_df3_i[primary_flag == 0]

fig, ax = plt.subplots()

# Drop missing percentages first: a NaN in the data would otherwise propagate
# into the quartiles and leave that group's box empty.
boxplot_data = [
    data_cs_course_offered['% Score 3-5'].dropna(),
    data_cs_course_not_offered['% Score 3-5'].dropna(),
]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, patch_artist=True)

# Name the two boxes explicitly; boxplot's `labels=` keyword is deprecated
# since Matplotlib 3.9, while setting tick labels works on every version.
ax.set_xticks([1, 2])
ax.set_xticklabels(['CS course offered', 'CS course not offered'])

# One fill colour per group.
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Draw outliers as small circles.
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# The ticks carry text labels (offered first), so the caption must not
# describe a 0/1 encoding.
ax.set_xlabel('CS course availability')
ax.set_ylabel('Percentage of Passing Scores')
ax.set_title('Comparison of Primary Schools and % of Scores 3-5 for Hispanics/Latino Group')

# Save the figure (creating the output folder on first use) before showing it.
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)
output_file_path = os.path.join(output_directory, 'Hispanic_Latino-Box-Plot(Primaryvs3-5%).png')
fig.savefig(output_file_path)
plt.show()
# Box plot of '% Score 3-5' for the Hispanic/Latino group, split by the
# existing 'CS course Binary for Secondary' flag of merged_df3_i.
secondary_flag = merged_df3_i['CS course Binary for Secondary']
data_cs_course_offered = merged_df3_i[secondary_flag == 1]
data_cs_course_not_offered = merged_df3_i[secondary_flag == 0]

fig, ax = plt.subplots()

# Drop missing percentages first: a NaN in the data would otherwise propagate
# into the quartiles and leave that group's box empty.
boxplot_data = [
    data_cs_course_offered['% Score 3-5'].dropna(),
    data_cs_course_not_offered['% Score 3-5'].dropna(),
]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, patch_artist=True)

# Name the two boxes explicitly; boxplot's `labels=` keyword is deprecated
# since Matplotlib 3.9, while setting tick labels works on every version.
ax.set_xticks([1, 2])
ax.set_xticklabels(['CS course offered', 'CS course not offered'])

# One fill colour per group.
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Draw outliers as small circles.
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# The ticks carry text labels (offered first), so the caption must not
# describe a 0/1 encoding.
ax.set_xlabel('CS course availability')
ax.set_ylabel('Percentage of Passing Scores')
ax.set_title('Comparison of Secondary Schools and % of Scores 3-5 for Hispanics/Latino Group')

# Save the figure (creating the output folder on first use) before showing it.
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)
output_file_path = os.path.join(output_directory, 'Hispanic_Latino-Box-Plot(Secondaryvs3-5%).png')
fig.savefig(output_file_path)
plt.show()
# Box plot of '% Score 3-5' for the Hispanic/Latino group, split by the
# existing 'CS course Binary for High' flag of merged_df3_i.
high_flag = merged_df3_i['CS course Binary for High']
data_cs_course_offered = merged_df3_i[high_flag == 1]
data_cs_course_not_offered = merged_df3_i[high_flag == 0]

fig, ax = plt.subplots()

# Drop missing percentages first: a NaN in the data would otherwise propagate
# into the quartiles and leave that group's box empty.
boxplot_data = [
    data_cs_course_offered['% Score 3-5'].dropna(),
    data_cs_course_not_offered['% Score 3-5'].dropna(),
]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, patch_artist=True)

# Name the two boxes explicitly; boxplot's `labels=` keyword is deprecated
# since Matplotlib 3.9, while setting tick labels works on every version.
ax.set_xticks([1, 2])
ax.set_xticklabels(['CS course offered', 'CS course not offered'])

# One fill colour per group.
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Draw outliers as small circles.
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# The ticks carry text labels (offered first), so the caption must not
# describe a 0/1 encoding.
ax.set_xlabel('CS course availability')
ax.set_ylabel('Percentage of Passing Scores')
ax.set_title('Comparison of High Schools and % of Scores 3-5 for Hispanics/Latino Group')

# Save the figure (creating the output folder on first use) before showing it.
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)
output_file_path = os.path.join(output_directory, 'Hispanic_Latino-Box-Plot(Highvs3-5%).png')
fig.savefig(output_file_path)
plt.show()
merged_df_others_o_ap.dtypes
import matplotlib.pyplot as plt
import seaborn as sns
# Derive one 0/1 flag column per school level for the Others (Asian + White)
# frame: 1 when the level's value is positive, 0 otherwise (a missing value
# compares False and is therefore flagged 0).
for level in ('Primary', 'Secondary', 'High'):
    merged_df_others_o_ap[f'CS course Binary for {level}'] = (
        merged_df_others_o_ap[level] > 0
    ).astype('int64')
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of '% Score 3-5' for the Others (Asian + White) group, split by
# the existing 'CS course Binary for Primary' flag of merged_df_others_o_ap.
primary_flag = merged_df_others_o_ap['CS course Binary for Primary']
data_cs_course_offered = merged_df_others_o_ap[primary_flag == 1]
data_cs_course_not_offered = merged_df_others_o_ap[primary_flag == 0]

# Create a figure and axes
fig, ax = plt.subplots()

# Drop missing percentages first: a NaN in the data would otherwise propagate
# into the quartiles and leave that group's box empty.
boxplot_data = [
    data_cs_course_offered['% Score 3-5'].dropna(),
    data_cs_course_not_offered['% Score 3-5'].dropna(),
]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, patch_artist=True)

# Name the two boxes explicitly; boxplot's `labels=` keyword is deprecated
# since Matplotlib 3.9, while setting tick labels works on every version.
ax.set_xticks([1, 2])
ax.set_xticklabels(['CS course offered', 'CS course not offered'])

# One fill colour per group.
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Draw outliers as small circles.
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# The ticks carry text labels (offered first), so the caption must not
# describe a 0/1 encoding.
ax.set_xlabel('CS course availability')
ax.set_ylabel('Percentage of Passing Scores')
ax.set_title('Comparison of Primary School Category and % of Scores 3-5 for Others Ethnic Group (Asian + White)')

# Save the figure (creating the output folder on first use) before showing it.
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)
output_file_path = os.path.join(output_directory, 'Others-Box-Plot(Primaryvs3-5%).png')
fig.savefig(output_file_path)
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of '% Score 3-5' for the Others (Asian + White) group, split by
# the existing 'CS course Binary for Secondary' flag of merged_df_others_o_ap.
secondary_flag = merged_df_others_o_ap['CS course Binary for Secondary']
data_cs_course_offered = merged_df_others_o_ap[secondary_flag == 1]
data_cs_course_not_offered = merged_df_others_o_ap[secondary_flag == 0]

# Create a figure and axes
fig, ax = plt.subplots()

# Drop missing percentages first: a NaN in the data would otherwise propagate
# into the quartiles and leave that group's box empty.
boxplot_data = [
    data_cs_course_offered['% Score 3-5'].dropna(),
    data_cs_course_not_offered['% Score 3-5'].dropna(),
]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, patch_artist=True)

# Name the two boxes explicitly; boxplot's `labels=` keyword is deprecated
# since Matplotlib 3.9, while setting tick labels works on every version.
ax.set_xticks([1, 2])
ax.set_xticklabels(['CS course offered', 'CS course not offered'])

# One fill colour per group.
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Draw outliers as small circles.
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# The ticks carry text labels (offered first), so the caption must not
# describe a 0/1 encoding.
ax.set_xlabel('CS course availability')
ax.set_ylabel('Percentage of Passing Scores')
ax.set_title('Comparison of Secondary School Category and % of Scores 3-5 for Others Ethnic Group (Asian + White)')

# Save the figure (creating the output folder on first use) before showing it.
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)
output_file_path = os.path.join(output_directory, 'Others-Box-Plot(Secondaryvs3-5%).png')
fig.savefig(output_file_path)
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of '% Score 3-5' for the Others (Asian + White) group, split by
# the existing 'CS course Binary for High' flag of merged_df_others_o_ap.
high_flag = merged_df_others_o_ap['CS course Binary for High']
data_cs_course_offered = merged_df_others_o_ap[high_flag == 1]
data_cs_course_not_offered = merged_df_others_o_ap[high_flag == 0]

# Create a figure and axes
fig, ax = plt.subplots()

# Drop missing percentages first: a NaN in the data would otherwise propagate
# into the quartiles and leave that group's box empty.
boxplot_data = [
    data_cs_course_offered['% Score 3-5'].dropna(),
    data_cs_course_not_offered['% Score 3-5'].dropna(),
]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, patch_artist=True)

# Name the two boxes explicitly; boxplot's `labels=` keyword is deprecated
# since Matplotlib 3.9, while setting tick labels works on every version.
ax.set_xticks([1, 2])
ax.set_xticklabels(['CS course offered', 'CS course not offered'])

# One fill colour per group.
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Draw outliers as small circles.
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# The ticks carry text labels (offered first), so the caption must not
# describe a 0/1 encoding.
ax.set_xlabel('CS course availability')
ax.set_ylabel('Percentage of Passing Scores')
ax.set_title('Comparison of High School Category and % of Scores 3-5 for Others Ethnic Group (Asian + White)')

# Save the figure (creating the output folder on first use) before showing it.
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)
output_file_path = os.path.join(output_directory, 'Others-Box-Plot(Highvs3-5%).png')
fig.savefig(output_file_path)
plt.show()
!ls
from ydata_profiling import ProfileReport, compare
# Combine the three profiling reports built earlier in the notebook into a
# single comparison report and write it out as an HTML file.
profiles_to_compare = [profile1, profile2, profile3]
comparison_report = compare(profiles_to_compare)
comparison_report.to_file("Comparison-AfrAmer-HispBlack-Others.html")
import pandas as pd
import matplotlib.pyplot as plt
# Scatter the three 'Sum_*' columns of merged_df1 against 'Tests Taken',
# one colour-coded series per column.
sum_columns = ['Sum_K-3', 'Sum_04-08', 'Sum_09-12']
scatter_data = merged_df1[sum_columns + ['Tests Taken']]

# Every series shares the same x values and is drawn half-transparent so
# overlapping points remain visible.
tests_taken = scatter_data['Tests Taken']
for sum_column in sum_columns:
    plt.scatter(tests_taken, scatter_data[sum_column], label=sum_column, alpha=0.5)

# Restrict the visible x range to 0-25 tests.
plt.xlim(0, 25)

# Axis captions, legend and grid.
plt.xlabel('Tests Taken')
plt.ylabel('Sum Values')
plt.legend()
plt.grid(True)

plt.show()
merged_df1.head(10)
# 'All Grades' expressed as a percentage of 'Total Students' for every row.
merged_df1['Percent-District-Afr-Amer'] = (
    merged_df1['All Grades'].mul(100).div(merged_df1['Total Students'])
)
merged_df1.dtypes
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise scatter matrix of the numeric indicators in merged_df1, with the
# points coloured by district name.
district_pair_columns = [
    'District Name_x',
    'Percent-District-Afr-Amer',
    'Tests Taken',
    '% Score 3-5',
    '% Score 1-2',
]
scatter_data = merged_df1[district_pair_columns]

# Apply the "ticks" seaborn style, draw the grid of plots and display it.
sns.set(style="ticks")
sns.pairplot(data=scatter_data, hue='District Name_x')
plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise scatter matrix of the numeric indicators in merged_df2, with the
# points coloured by district name.
# NOTE(review): in this part of the file 'Percent-District-Afr-Amer' is only
# assigned on merged_df1 - confirm merged_df2 carries that column as well.
district_pair_columns = [
    'District Name_x',
    'Percent-District-Afr-Amer',
    'Tests Taken',
    '% Score 3-5',
    '% Score 1-2',
]
scatter_data = merged_df2[district_pair_columns]

# Apply the "ticks" seaborn style, draw the grid of plots and display it.
sns.set(style="ticks")
sns.pairplot(data=scatter_data, hue='District Name_x')
plt.show()
# Lookup table from district name (as spelled in the 'District Name_x'
# column) to county name. Regional and charter districts are assigned the
# county of the town the school is located in.
# Corrections relative to the earlier version of this table:
#   - "Boston Preparatory Charter Public (District)" held the corrupted value
#     "SuffolBostonk"; it is now "Suffolk".
#   - Argosy Collegiate (Fall River), Benjamin Banneker (Cambridge),
#     Benjamin Franklin Classical (Franklin), Innovation Academy
#     (Tyngsborough), Libertas Academy (Springfield), Martin Luther King Jr.
#     Charter School of Excellence (Springfield), Old Colony RVT (Rochester)
#     and Pathfinder RVT (Palmer) were mapped to the wrong county.
district_county_map = {
    "Abby Kelley Foster Charter Public (District)": "Worcester",
    "Abington": "Plymouth",
    "Academy Of the Pacific Rim Charter Public (District)": "Suffolk",
    "Acton-Boxborough": "Middlesex",
    "Acushnet": "Bristol",
    "Advanced Math and Science Academy Charter (District)": "Middlesex",
    "Agawam": "Hampden",
    "Alma del Mar Charter School (District)": "Bristol",
    "Amesbury": "Essex",
    "Amherst": "Hampshire",
    "Amherst-Pelham": "Hampshire",
    "Andover": "Essex",
    "Argosy Collegiate Charter School (District)": "Bristol",
    "Arlington": "Middlesex",
    "Ashburnham-Westminster": "Worcester",
    "Ashland": "Middlesex",
    "Assabet Valley Regional Vocational Technical": "Middlesex",
    "Athol-Royalston": "Worcester",
    "Atlantis Charter (District)": "Bristol",
    "Attleboro": "Bristol",
    "Auburn": "Worcester",
    "Avon": "Norfolk",
    "Ayer Shirley School District": "Middlesex",
    "Barnstable": "Barnstable",
    "Baystate Academy Charter Public School (District)": "Hampden",
    "Bedford": "Middlesex",
    "Belchertown": "Hampshire",
    "Bellingham": "Norfolk",
    "Belmont": "Middlesex",
    "Benjamin Banneker Charter Public (District)": "Middlesex",
    "Benjamin Franklin Classical Charter Public (District)": "Norfolk",
    "Berkley": "Bristol",
    "Berkshire Arts and Technology Charter Public (District)": "Berkshire",
    "Berkshire Hills": "Berkshire",
    "Berlin-Boylston": "Worcester",
    "Beverly": "Essex",
    "Billerica": "Middlesex",
    "Blackstone Valley Regional Vocational Technical": "Worcester",
    "Blackstone-Millville": "Worcester",
    "Blue Hills Regional Vocational Technical": "Norfolk",
    "Boston": "Suffolk",
    "Boston Collegiate Charter (District)": "Suffolk",
    "Boston Day and Evening Academy Charter (District)": "Suffolk",
    "Boston Green Academy Horace Mann Charter School (District)": "Suffolk",
    "Boston Preparatory Charter Public (District)": "Suffolk",
    "Boston Renaissance Charter Public (District)": "Suffolk",
    "Bourne": "Barnstable",
    "Boxford": "Essex",
    "Braintree": "Norfolk",
    "Brewster": "Barnstable",
    "Bridge Boston Charter School (District)": "Suffolk",
    "Bridgewater-Raynham": "Plymouth",
    "Brimfield": "Hampden",
    "Bristol County Agricultural": "Bristol",
    "Bristol-Plymouth Regional Vocational Technical": "Bristol",
    "Brockton": "Plymouth",
    "Brooke Charter School (District)": "Suffolk",
    "Brookfield": "Worcester",
    "Brookline": "Norfolk",
    "Burlington": "Middlesex",
    "Cambridge": "Middlesex",
    "Canton": "Norfolk",
    "Cape Cod Lighthouse Charter (District)": "Barnstable",
    "Cape Cod Regional Vocational Technical": "Barnstable",
    "Carlisle": "Middlesex",
    "Carver": "Plymouth",
    "Central Berkshire": "Berkshire",
    "Chelmsford": "Middlesex",
    "Chelsea": "Suffolk",
    "Chesterfield-Goshen": "Hampshire",
    "Chicopee": "Hampden",
    "Christa McAuliffe Charter Public (District)": "Middlesex",
    "City on a Hill Charter Public School (District)": "Suffolk",
    "Clarksburg": "Berkshire",
    "Clinton": "Worcester",
    "Codman Academy Charter Public (District)": "Suffolk",
    "Cohasset": "Norfolk",
    "Collegiate Charter School of Lowell (District)": "Middlesex",
    "Community Charter School of Cambridge (District)": "Middlesex",
    "Community Day Charter Public School - Gateway (District)": "Essex",
    "Community Day Charter Public School - Prospect (District)": "Essex",
    "Community Day Charter Public School - R. Kingman Webster (District)": "Essex",
    "Concord": "Middlesex",
    "Concord-Carlisle": "Middlesex",
    "Conservatory Lab Charter (District)": "Suffolk",
    "Conway": "Franklin",
    "Danvers": "Essex",
    "Dartmouth": "Bristol",
    "Dedham": "Norfolk",
    "Deerfield": "Franklin",
    "Dennis-Yarmouth": "Barnstable",
    "Dighton-Rehoboth": "Bristol",
    "Douglas": "Worcester",
    "Dover": "Norfolk",
    "Dover-Sherborn": "Norfolk",
    "Dracut": "Middlesex",
    "Dudley Street Neighborhood Charter School (District)": "Suffolk",
    "Dudley-Charlton Reg": "Worcester",
    "Duxbury": "Plymouth",
    "East Bridgewater": "Plymouth",
    "East Longmeadow": "Hampden",
    "Eastham": "Barnstable",
    "Easthampton": "Hampshire",
    "Easton": "Bristol",
    "Edgartown": "Dukes",
    "Edward M. Kennedy Academy for Health Careers (Horace Mann Charter) (District)": "Suffolk",
    "Erving": "Franklin",
    "Essex North Shore Agricultural and Technical School District": "Essex",
    "Everett": "Middlesex",
    "Excel Academy Charter (District)": "Suffolk",
    "Fairhaven": "Bristol",
    "Fall River": "Bristol",
    "Falmouth": "Barnstable",
    "Farmington River Reg": "Berkshire",
    "Fitchburg": "Worcester",
    "Four Rivers Charter Public (District)": "Franklin",
    "Foxborough": "Norfolk",
    "Foxborough Regional Charter (District)": "Norfolk",
    "Framingham": "Middlesex",
    "Francis W. Parker Charter Essential (District)": "Middlesex",
    "Franklin": "Norfolk",
    "Franklin County Regional Vocational Technical": "Franklin",
    "Freetown-Lakeville": "Plymouth",
    "Frontier": "Franklin",
    "Gardner": "Worcester",
    "Gateway": "Hampshire",
    "Georgetown": "Essex",
    "Gill-Montague": "Franklin",
    "Global Learning Charter Public (District)": "Bristol",
    "Gloucester": "Essex",
    "Grafton": "Worcester",
    "Granby": "Hampshire",
    "Greater Commonwealth Virtual District": "Berkshire",
    "Greater Fall River Regional Vocational Technical": "Bristol",
    "Greater Lawrence Regional Vocational Technical": "Essex",
    "Greater Lowell Regional Vocational Technical": "Middlesex",
    "Greater New Bedford Regional Vocational Technical": "Bristol",
    "Greenfield": "Franklin",
    "Groton-Dunstable": "Middlesex",
    "Hadley": "Hampshire",
    "Halifax": "Plymouth",
    "Hamilton-Wenham": "Essex",
    "Hampden Charter School of Science East (District)": "Hampden",
    "Hampden Charter School of Science West (District)": "Hampden",
    "Hampden-Wilbraham": "Hampden",
    "Hampshire": "Hampshire",
    "Hancock": "Berkshire",
    "Hanover": "Plymouth",
    "Harvard": "Worcester",
    "Hatfield": "Hampshire",
    "Haverhill": "Essex",
    "Hawlemont": "Franklin",
    "Helen Y. Davis Leadership Academy Charter Public (District)": "Suffolk",
    "Hill View Montessori Charter Public (District)": "Essex",
    "Hilltown Cooperative Charter Public (District)": "Hampshire",
    "Hingham": "Plymouth",
    "Holbrook": "Norfolk",
    "Holland": "Hampden",
    "Holliston": "Middlesex",
    "Holyoke": "Hampden",
    "Holyoke Community Charter (District)": "Hampden",
    "Hoosac Valley Regional": "Berkshire",
    "Hopedale": "Worcester",
    "Hopkinton": "Middlesex",
    "Hudson": "Middlesex",
    "Hull": "Plymouth",
    "Innovation Academy Charter (District)": "Middlesex",
    "Ipswich": "Essex",
    "KIPP Academy Boston Charter School (District)": "Suffolk",
    "KIPP Academy Lynn Charter (District)": "Essex",
    "King Philip": "Norfolk",
    "Kingston": "Plymouth",
    "Lawrence": "Essex",
    "Lawrence Family Development Charter (District)": "Essex",
    "Learning First Charter Public School (District)": "Worcester",
    "Lee": "Berkshire",
    "Leicester": "Worcester",
    "Lenox": "Berkshire",
    "Leominster": "Worcester",
    "Leverett": "Franklin",
    "Lexington": "Middlesex",
    "Libertas Academy Charter School (District)": "Hampden",
    "Lincoln": "Middlesex",
    "Lincoln-Sudbury": "Middlesex",
    "Littleton": "Middlesex",
    "Longmeadow": "Hampden",
    "Lowell": "Middlesex",
    "Lowell Community Charter Public (District)": "Middlesex",
    "Lowell Middlesex Academy Charter (District)": "Middlesex",
    "Ludlow": "Hampden",
    "Lunenburg": "Worcester",
    "Lynn": "Essex",
    "Lynnfield": "Essex",
    "Malden": "Middlesex",
    "Manchester Essex Regional": "Essex",
    "Mansfield": "Bristol",
    "Marblehead": "Essex",
    "Marion": "Plymouth",
    "Marlborough": "Middlesex",
    "Marshfield": "Plymouth",
    "Martha's Vineyard": "Dukes",
    "Martin Luther King Jr. Charter School of Excellence (District)": "Hampden",
    "Masconomet": "Essex",
    "Mashpee": "Barnstable",
    "Mass Academy of Math and Science at WPI (District)": "Worcester",
    # NOTE(review): not a Massachusetts district/county - kept so the key set
    # is unchanged; confirm and remove.
    "Mat-Su Borough": "Anchorage",
    "Matter and Form Charter School (District)": "Suffolk",
    "Mattapoisett": "Plymouth",
    "Maynard": "Middlesex",
    "Medfield": "Norfolk",
    "Medford": "Middlesex",
    "Medway": "Norfolk",
    "Melrose": "Middlesex",
    "Memorial School District": "Bristol",
    "Mendon-Upton": "Worcester",
    "Methuen": "Essex",
    "Middleborough": "Plymouth",
    "Middleton": "Essex",
    "Milford": "Worcester",
    "Millis": "Norfolk",
    "Milton": "Norfolk",
    "Mohawk Trail": "Franklin",
    "Monomoy Regional": "Barnstable",
    "Monson": "Hampden",
    "Montachusett Regional Vocational Technical": "Worcester",
    "Montague": "Franklin",
    "Monterey": "Berkshire",
    "Mount Greylock": "Berkshire",
    "Nahant": "Essex",
    "Nantucket": "Nantucket",
    "Narragansett": "Worcester",
    "Nashoba": "Middlesex",
    "Nashoba Valley Technical": "Middlesex",
    "Nashua River Valley Vocational Technical": "Middlesex",
    "Natick": "Middlesex",
    "Nauset": "Barnstable",
    "Needham": "Norfolk",
    "New Bedford": "Bristol",
    "New England Academy Charter School (District)": "Middlesex",
    "New Heights Charter School of Brockton (District)": "Plymouth",
    "New Salem-Wendell": "Franklin",
    "Newburyport": "Essex",
    "Newton": "Middlesex",
    "North Adams": "Berkshire",
    "North Andover": "Essex",
    "North Attleborough": "Bristol",
    "North Brookfield": "Worcester",
    "North Middlesex": "Middlesex",
    "North Reading": "Middlesex",
    "Northampton": "Hampshire",
    "Northborough": "Worcester",
    "Northbridge": "Worcester",
    "Norton": "Bristol",
    "Norwell": "Plymouth",
    "Norwood": "Norfolk",
    "Old Colony Regional Vocational Technical": "Plymouth",
    "Old Rochester": "Plymouth",
    "Orange": "Franklin",
    "Orleans": "Barnstable",
    "Oxford": "Worcester",
    "Palmer": "Hampden",
    "Pathfinder Regional Vocational Technical": "Hampden",
    "Pembroke": "Plymouth",
    "Pentucket": "Essex",
    "Petersham": "Worcester",
    "Pittsfield": "Berkshire",
    "Plainville": "Norfolk",
    "Plymouth": "Plymouth",
    "Plympton": "Plymouth",
    "Provincetown": "Barnstable",
    "Quaboag Regional": "Worcester",
    "Quabbin": "Worcester",
    "Quincy": "Norfolk",
    "Ralph C. Mahar": "Franklin",
    "Randolph": "Norfolk",
    "Reading": "Middlesex",
    "Revere": "Suffolk",
    "Richmond": "Berkshire",
    "Rising Tide Charter Public (District)": "Plymouth",
    "Rochester": "Plymouth",
    "Rockland": "Plymouth",
    "Rockport": "Essex",
    "Rowe": "Franklin",
    "Royalston": "Worcester",
    "Russell": "Hampden",
    "Rutland": "Worcester",
    "Salem": "Essex",
    "Salem Academy Charter (District)": "Essex",
    "Salisbury": "Essex",
    "Sandwich": "Barnstable",
    "Saugus": "Essex",
    "Savoy": "Berkshire",
    "Scituate": "Plymouth",
    "Seekonk": "Bristol",
    "Sharon": "Norfolk",
    "Shawsheen Valley Regional Vocational Technical": "Middlesex",
    "Sherborn": "Middlesex",
    "Shirley": "Middlesex",
    "Shrewsbury": "Worcester",
    "Shutesbury": "Franklin",
    "Silver Lake": "Plymouth",
    "Somerset": "Bristol",
    "Somerville": "Middlesex",
    "South Hadley": "Hampshire",
    "South Middlesex Regional Vocational Technical": "Middlesex",
    "South Shore Charter Public (District)": "Plymouth",
    "South Shore Regional Vocational Technical": "Plymouth",
    "Southborough": "Worcester",
    "Southbridge": "Worcester",
    "Southwick-Tolland-Granville Regional": "Hampden",
    "Spencer-East Brookfield": "Worcester",
    "Springfield": "Hampden",
    "Springfield Conservatory of the Arts (District)": "Hampden",
    "Stoneham": "Middlesex",
    "Stoughton": "Norfolk",
    "Sturbridge": "Worcester",
    "Sudbury": "Middlesex",
    "Sunderland": "Franklin",
    "Sutton": "Worcester",
    "Swampscott": "Essex",
    "Swansea": "Bristol",
    "Tantasqua": "Worcester",
    # NOTE(review): not a Massachusetts district/county - kept so the key set
    # is unchanged; confirm and remove.
    "Tecumseh": "Lenawee",
    "Tewksbury": "Middlesex",
    "Tisbury": "Dukes",
    "Topsfield": "Essex",
    "Tri-County Regional Vocational Technical": "Norfolk",
    "Tyngsborough": "Middlesex",
    "Uxbridge": "Worcester",
    "Wachusett": "Worcester",
    "Wales": "Hampden",
    "Walpole": "Norfolk",
    "Waltham": "Middlesex",
    "Ware": "Hampshire",
    "Wareham": "Plymouth",
    "Warren": "Worcester",
    "Warwick": "Franklin",
    "Washington": "Berkshire",
    "Watertown": "Middlesex",
    "Wayland": "Middlesex",
    "Webster": "Worcester",
    "Wellesley": "Norfolk",
    "Wellfleet": "Barnstable",
    "West Boylston": "Worcester",
    "West Bridgewater": "Plymouth",
    "West Brookfield": "Worcester",
    "West Springfield": "Hampden",
    "West Stockbridge": "Berkshire",
    "West Tisbury": "Dukes",
    "Westborough": "Worcester",
    "Westfield": "Hampden",
    "Westford": "Middlesex",
    "Westhampton": "Hampshire",
    "Westminster": "Worcester",
    "Weston": "Middlesex",
    "Westport": "Bristol",
    "Westwood": "Norfolk",
    "Weymouth": "Norfolk",
    "Whately": "Franklin",
    "Whitman-Hanson": "Plymouth",
    "Wilbraham": "Hampden",
    "Williamsburg": "Hampshire",
    "Williamstown": "Berkshire",
    "Wilmington": "Middlesex",
    "Winchendon": "Worcester",
    "Winchester": "Middlesex",
    "Windsor": "Berkshire",
    "Winthrop": "Suffolk",
    "Woburn": "Middlesex",
    "Worcester": "Worcester",
    "Worthington": "Hampshire",
    "Wrentham": "Norfolk",
    "Yarmouth": "Barnstable",
    "Florida": "Berkshire",
    "MATCH Charter Public School (District)": "Suffolk",
    "Ma Academy for Math and Science": "Worcester",
    "Map Academy Charter School (District)": "Plymouth",
    "Marblehead Community Charter Public (District)": "Essex",
    "Martha's Vineyard Charter (District)": "Dukes",
    "Millbury": "Worcester",
    "Minuteman Regional Vocational Technical": "Middlesex",
    "Monomoy Regional School District": "Barnstable",
    "Mystic Valley Regional Charter (District)": "Middlesex",
    "Nashoba Valley Regional Vocational Technical": "Middlesex",
    "Neighborhood House Charter (District)": "Suffolk",
    "Norfolk": "Norfolk",
    "Norfolk County Agricultural": "Norfolk",
    "Northampton-Smith Vocational Agricultural": "Hampshire",
    "Northboro-Southboro": "Worcester",
    "Northeast Metropolitan Regional Vocational Technical": "Middlesex",
    "Northern Berkshire Regional Vocational Technical": "Berkshire",
    "Oak Bluffs": "Dukes",
    "Old Sturbridge Academy Charter Public School (District)": "Worcester",
    "Peabody": "Essex",
    "Pelham": "Hampshire",
    "Phoenix Academy Public Charter High School Lawrence (District)": "Essex",
    "Phoenix Academy Public Charter High School Springfield (District)": "Hampden",
    "Phoenix Charter Academy (District)": "Suffolk",
    "Pioneer Charter School of Science (District)": "Middlesex",
    "Pioneer Charter School of Science II (PCSS-II) (District)": "Essex",
    "Pioneer Valley": "Franklin",
    "Pioneer Valley Chinese Immersion Charter (District)": "Hampshire",
    "Pioneer Valley Performing Arts Charter Public (District)": "Hampshire",
    "Prospect Hill Academy Charter (District)": "Middlesex",
    "Ralph C Mahar": "Franklin",
    "River Valley Charter (District)": "Essex",
    "Roxbury Preparatory Charter (District)": "Suffolk",
    "Sizer School: A North Central Charter Essential (District)": "Worcester",
    "Somerset Berkley Regional School District": "Bristol",
    "Southampton": "Hampshire",
    "Southeastern Regional Vocational Technical": "Bristol",
    "Southern Berkshire": "Berkshire",
    "Southern Worcester County Regional Vocational School District": "Worcester",
    "Southwick-Tolland-Granville Regional School District": "Hampden",
    "Spencer-E Brookfield": "Worcester",
    "Springfield International Charter (District)": "Hampden",
    "Springfield Preparatory Charter School (District)": "Hampden",
    "Sturgis Charter Public (District)": "Barnstable",
    "TEC Connections Academy Commonwealth Virtual School District": "Norfolk",
    "Taunton": "Bristol",
    "Triton": "Essex",
    "Truro": "Barnstable",
    "UP Academy Charter School of Boston (District)": "Suffolk",
    "UP Academy Charter School of Dorchester (District)": "Suffolk",
    "Up-Island Regional": "Dukes",
    "Upper Cape Cod Regional Vocational Technical": "Barnstable",
    "Veritas Preparatory Charter School (District)": "Hampden",
    "Wakefield": "Middlesex",
    "Whittier Regional Vocational Technical": "Essex",
}
# Attach a 'County' column to both merged frames by looking each district
# name up in district_county_map; names missing from the map become NaN.
for merged_frame in (merged_df1, merged_df2):
    merged_frame['County'] = merged_frame['District Name_x'].map(district_county_map)
merged_df1
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise scatter matrix of the numeric indicators in merged_df1, with the
# points coloured by county.
county_pair_columns = [
    'County',
    'Percent-District-Afr-Amer',
    'Tests Taken',
    '% Score 3-5',
    '% Score 1-2',
]
scatter_data = merged_df1[county_pair_columns]

# Apply the "ticks" seaborn style, draw the grid of plots and display it.
sns.set(style="ticks")
sns.pairplot(data=scatter_data, hue='County')
plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise scatter matrix of the numeric indicators in merged_df2, with the
# points coloured by county.
# NOTE(review): in this part of the file 'Percent-District-Afr-Amer' is only
# assigned on merged_df1 - confirm merged_df2 carries that column as well.
county_pair_columns = [
    'County',
    'Percent-District-Afr-Amer',
    'Tests Taken',
    '% Score 3-5',
    '% Score 1-2',
]
scatter_data = merged_df2[county_pair_columns]

# Apply the "ticks" seaborn style, draw the grid of plots and display it.
sns.set(style="ticks")
sns.pairplot(data=scatter_data, hue='County')
plt.show()
# df1.isnull().sum() # column wise null check
# df1['All Grades'] = df1['All Grades'].str.replace(',', '').astype('float64')
# df1['Total Students'] = df1['Total Students'].str.replace(',', '').astype('float64')
# merged_df1_i = merged_df1_i.drop(['CS course Binary for Sum_K-3', 'CS course Binary for Sum_04-08', 'CS course Binary for Sum_09-12'], axis=1)