project
Testing
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
import plotly.graph_objects as go
ok_cupid_df = pd.read_csv('data/okcupid_profiles.csv')
ok_cupid_df.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
dtype='object')
ok_cupid_df.shape
(59946, 31)
ok_cupid_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 59946 non-null int64
1 status 59946 non-null object
2 sex 59946 non-null object
3 orientation 59946 non-null object
4 body_type 54650 non-null object
5 diet 35551 non-null object
6 drinks 56961 non-null object
7 drugs 45866 non-null object
8 education 53318 non-null object
9 ethnicity 54266 non-null object
10 height 59943 non-null float64
11 income 59946 non-null int64
12 job 51748 non-null object
13 last_online 59946 non-null object
14 location 59946 non-null object
15 offspring 24385 non-null object
16 pets 40025 non-null object
17 religion 39720 non-null object
18 sign 48890 non-null object
19 smokes 54434 non-null object
20 speaks 59896 non-null object
21 essay0 54458 non-null object
22 essay1 52374 non-null object
23 essay2 50308 non-null object
24 essay3 48470 non-null object
25 essay4 49409 non-null object
26 essay5 49096 non-null object
27 essay6 46175 non-null object
28 essay7 47495 non-null object
29 essay8 40721 non-null object
30 essay9 47343 non-null object
dtypes: float64(1), int64(2), object(28)
memory usage: 14.2+ MB
ok_cupid_df.head(2)
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | ... | about me: i would love to think that i was so... | currently working as an international agent fo... | making people laugh. ranting about a good salt... | the way i look. i am a six foot half asian, ha... | books: absurdistan, the republic, of mice and ... | food. water. cell phone. shelter. | duality and humorous things | trying to find someone to hang out with. i am ... | i am new to california and looking for someone... | you want to be swept off your feet! you are ti... |
1 | 35 | single | m | straight | average | mostly other | often | sometimes | working on space camp | white | ... | i am a chef: this is what that means. 1. i am ... | dedicating everyday to being an unbelievable b... | being silly. having ridiculous amonts of fun w... | NaN | i am die hard christopher moore fan. i don't r... | delicious porkness in all of its glories. my b... | NaN | NaN | i am very open and will share just about anyth... | NaN |
2 rows × 31 columns
Check null values:
ok_cupid_df.isna().sum()
age 0
status 0
sex 0
orientation 0
body_type 5296
diet 24395
drinks 2985
drugs 14080
education 6628
ethnicity 5680
height 3
income 0
job 8198
last_online 0
location 0
offspring 35561
pets 19921
religion 20226
sign 11056
smokes 5512
speaks 50
essay0 5488
essay1 7572
essay2 9638
essay3 11476
essay4 10537
essay5 10850
essay6 13771
essay7 12451
essay8 19225
essay9 12603
dtype: int64
ok_cupid_df.isna().sum()/ok_cupid_df.shape[0]*100
age 0.000000
status 0.000000
sex 0.000000
orientation 0.000000
body_type 8.834618
diet 40.694959
drinks 4.979482
drugs 23.487806
education 11.056618
ethnicity 9.475194
height 0.005005
income 0.000000
job 13.675641
last_online 0.000000
location 0.000000
offspring 59.321723
pets 33.231575
religion 33.740366
sign 18.443266
smokes 9.194942
speaks 0.083408
essay0 9.154906
essay1 12.631368
essay2 16.077803
essay3 19.143896
essay4 17.577486
essay5 18.099623
essay6 22.972342
essay7 20.770360
essay8 32.070530
essay9 21.023922
dtype: float64
Fill null values:
ok_cupid_df['drugs'] = ok_cupid_df['drugs'].fillna('unknown_drugs')
ok_cupid_df['drugs'].value_counts()
never 37724
unknown_drugs 14080
sometimes 7732
often 410
Name: drugs, dtype: int64
# Use 'unknown_diet' (underscored) for consistency with every other
# unknown_* sentinel introduced in this cleaning pass (unknown_drugs,
# unknown_job, ...); no later code depends on the old spelling.
ok_cupid_df['diet'] = ok_cupid_df['diet'].fillna('unknown_diet')
ok_cupid_df['diet'].value_counts()
unknowndiet 24395
mostly anything 16585
anything 6183
strictly anything 5113
mostly vegetarian 3444
mostly other 1007
strictly vegetarian 875
vegetarian 667
strictly other 452
mostly vegan 338
other 331
strictly vegan 228
vegan 136
mostly kosher 86
mostly halal 48
strictly halal 18
strictly kosher 18
halal 11
kosher 11
Name: diet, dtype: int64
#ok_cupid_df.loc[(ok_cupid_df['diet'] == 'unknown')&(ok_cupid_df['essay0'] == '57'), 'status'] = 'available'
ok_cupid_df.loc[ok_cupid_df['essay0'] == "im looking for someone to share some raging adhd. im a self motivated and light hearted superhero who enjoy's riding my bike everywhere and eating every goddamn thing i can. im looking for someone to go adventuring with. i enjoy blind drunken adventures sometimes but you dont have to be a drinker. no vegans, i will eat anything... including people... especially hipsters. im not really a nerd (i don't play magic cards/excessive videogames) but i can like nerdy girls. i just got this account, so gimmie some time to write down more shenanigans that are important if u make chiptunes hit me the fuck up! i wanna make some! i am awesome, eccentric, and energetic", 'diet'] = 'strictly anything'
ok_cupid_df.loc[ok_cupid_df['essay0'] == "rabid bibliophile, humorless feminist (that's a joke), eternal student. i like to write poetry on people, bake (vegan) cupcakes, make art and dress-up. i identify as queer but my choices here are limited so i chose bisexual. i am quiet, empathetic, and geeky", 'diet'] = 'vegan'
#ok_cupid_df[ok_cupid_df['diet'] == 'unknown_diet']['essay0'].tolist()
ok_cupid_df['status'].value_counts()
single 55697
seeing someone 2064
available 1865
married 310
unknown 10
Name: status, dtype: int64
# Reassign rather than replace(..., inplace=True) on a column selection:
# in-place mutation of a slice is chained assignment, deprecated and
# removed in pandas 3.x.
ok_cupid_df['status'] = ok_cupid_df['status'].replace({'unknown': 'unknown_status'})
ok_cupid_df['status'].value_counts()
single 55697
seeing someone 2064
available 1865
married 310
unknown_status 10
Name: status, dtype: int64
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].fillna('unknown_body_type')
ok_cupid_df['body_type'].value_counts()
average 14652
fit 12711
athletic 11819
unknown_body_type 5296
thin 4711
curvy 3924
a little extra 2629
skinny 1777
full figured 1009
overweight 444
jacked 421
used up 355
rather not say 198
Name: body_type, dtype: int64
ok_cupid_df['education'] = ok_cupid_df['education'].fillna('unknown_education')
ok_cupid_df['education'].value_counts()
graduated from college/university 23959
graduated from masters program 8961
unknown_education 6628
working on college/university 5712
working on masters program 1683
graduated from two-year college 1531
graduated from high school 1428
graduated from ph.d program 1272
graduated from law school 1122
working on two-year college 1074
dropped out of college/university 995
working on ph.d program 983
college/university 801
graduated from space camp 657
dropped out of space camp 523
graduated from med school 446
working on space camp 445
working on law school 269
two-year college 222
working on med school 212
dropped out of two-year college 191
dropped out of masters program 140
masters program 136
dropped out of ph.d program 127
dropped out of high school 102
high school 96
working on high school 87
space camp 58
ph.d program 26
law school 19
dropped out of law school 18
dropped out of med school 12
med school 11
Name: education, dtype: int64
# Fill the remaining categorical columns with a per-column
# 'unknown_<column>' sentinel so missing values become an explicit
# category instead of NaN.
for column in ('job', 'ethnicity', 'offspring', 'pets',
               'religion', 'sign', 'smokes'):
    ok_cupid_df[column] = ok_cupid_df[column].fillna('unknown_' + column)
ok_cupid_df['smokes'].value_counts()
no 43896
unknown_smokes 5512
sometimes 3787
when drinking 3040
yes 2231
trying to quit 1480
Name: smokes, dtype: int64
ok_cupid_df['drinks'] = ok_cupid_df['drinks'].fillna('unknown_drinks')
ok_cupid_df['drinks'].value_counts()
socially 41780
rarely 5957
often 5164
not at all 3267
unknown_drinks 2985
very often 471
desperately 322
Name: drinks, dtype: int64
ok_cupid_df['speaks'] = ok_cupid_df['speaks'].fillna('unknown_speaks')
ok_cupid_df['speaks'].value_counts()
english 21828
english (fluently) 6628
english (fluently), spanish (poorly) 2059
english (fluently), spanish (okay) 1917
english (fluently), spanish (fluently) 1288
...
english (fluently), french (poorly), polish (poorly), latin (poorly), italian (poorly) 1
english (fluently), hebrew (fluently), yiddish (fluently) 1
english (fluently), spanish (okay), catalan (poorly), italian (poorly) 1
english (fluently), c++ (fluently), bengali (okay), french (poorly) 1
english (fluently), ancient greek (okay), spanish (fluently), french (poorly), hebrew (poorly) 1
Name: speaks, Length: 7648, dtype: int64
ok_cupid_df[ok_cupid_df['height'].isna()]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36428 | 32 | single | f | straight | unknown_body_type | unknowndiet | unknown_drinks | unknown_drugs | unknown_education | other | ... | NaN | NaN | NaN | NaN | thomas bernhard, foucault, annie hall, taxi dr... | NaN | consciousness | NaN | i passionately hate liars! | you know what my user name means and if you ar... |
54002 | 25 | single | m | straight | unknown_body_type | unknowndiet | unknown_drinks | never | unknown_education | hispanic / latin | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
58983 | 49 | single | m | straight | unknown_body_type | unknowndiet | unknown_drinks | unknown_drugs | unknown_education | unknown_ethnicity | ... | great guy, lots of positive attributes*, but s... | living it. quite a bit more than that - more ... | lots, notably good, deep, excellent communicat... | some positive stuff, but i'll hold my tongue o... | lots. not especially up to listing 'em here a... | 1. damn good friend, or better 2. managing to ... | many things. maybe too much. not really up for... | at the moment, i'd rather not even say or thin... | i have a blog of much that's personal and priv... | you've good reason to think we'd like make at ... |
3 rows × 31 columns
ok_cupid_df.isna().sum()
age 0
status 0
sex 0
orientation 0
body_type 0
diet 0
drinks 0
drugs 0
education 0
ethnicity 0
height 3
income 0
job 0
last_online 0
location 0
offspring 0
pets 0
religion 0
sign 0
smokes 0
speaks 0
essay0 5488
essay1 7572
essay2 9638
essay3 11476
essay4 10537
essay5 10850
essay6 13771
essay7 12451
essay8 19225
essay9 12603
dtype: int64
Check for duplicated rows:
ok_cupid_df[ok_cupid_df.duplicated()]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 |
---|
0 rows × 31 columns
Transform essays:
# Essays: replace missing text with '' and merge the ten essay fields
# into one free-text column for the length/word-count features below.
essay_columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4',
                 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
ok_cupid_df[essay_columns] = ok_cupid_df[essay_columns].fillna('')
ok_cupid_df['essay'] = ok_cupid_df[essay_columns].agg(' '.join, axis=1)
# Strip punctuation, keeping only word characters and whitespace.
# Raw string + explicit regex=True are required: since pandas 2.0
# str.replace no longer treats the pattern as a regex by default, and
# '\w' in a plain string literal is an invalid escape sequence.
ok_cupid_df['essay'] = ok_cupid_df['essay'].str.replace(r'[^\w\s]', '', regex=True)
avg_words = pd.DataFrame()
avg_words['avg_words'] = ok_cupid_df['essay'].str.split().str.len()
avg_words.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
avg_words | 59946.0 | 353.896757 | 293.270595 | 0.0 | 158.0 | 296.0 | 477.0 | 10486.0 |
ok_cupid_df['essay_len'] = ok_cupid_df['essay'].str.len()
ok_cupid_df = ok_cupid_df.drop(columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9'])
ok_cupid_df.head(2)
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | ... | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 |
1 | 35 | single | m | straight | average | mostly other | often | sometimes | working on space camp | white | ... | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 |
2 rows × 23 columns
ok_cupid_df.isna().sum()
age 0
status 0
sex 0
orientation 0
body_type 0
diet 0
drinks 0
drugs 0
education 0
ethnicity 0
height 3
income 0
job 0
last_online 0
location 0
offspring 0
pets 0
religion 0
sign 0
smokes 0
speaks 0
essay 0
essay_len 0
dtype: int64
Drop 3 rows with null values:
ok_cupid_df.dropna(inplace=True)
ok_cupid_df[ok_cupid_df['age']>100]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2512 | 110 | single | f | straight | unknown_body_type | unknowndiet | unknown_drinks | unknown_drugs | unknown_education | unknown_ethnicity | ... | 2012-06-27-22-16 | daly city, california | unknown_offspring | unknown_pets | unknown_religion | unknown_sign | unknown_smokes | english | 9 | |
25324 | 109 | available | m | straight | athletic | mostly other | unknown_drinks | never | working on masters program | unknown_ethnicity | ... | 2012-06-30-18-18 | san francisco, california | might want kids | unknown_pets | other and somewhat serious about it | aquarius but it doesn’t matter | when drinking | english (okay) | nothing | 16 |
2 rows × 23 columns
#ok_cupid_df = ok_cupid_df.drop(ok_cupid_df.index[[, 25324]])
ok_cupid_df = ok_cupid_df.drop(index = [2512, 25324])
ok_cupid_df[ok_cupid_df['age']>100]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len |
---|
0 rows × 23 columns
Check duplicated columns:
ok_cupid_df.columns.duplicated()
array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False])
ok_cupid_df[ok_cupid_df.duplicated()]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len |
---|
0 rows × 23 columns
ok_cupid_df['income'].value_counts()
-1 48437
20000 2952
100000 1621
80000 1111
30000 1048
40000 1005
50000 975
60000 736
70000 707
150000 631
1000000 521
250000 149
500000 48
Name: income, dtype: int64
ok_cupid_df.shape
(59941, 23)
#for column in ok_cupid_df.columns:
# plt.figure()
# plt.hist(ok_cupid_df[column], bins=25)
# plt.title(f'Histogram of {column}')
# plt.show()
Add binary column for gender (and drop existing sex column):
# Keep only profiles that reported an income (-1 means "not stated").
# Take an explicit copy: the education re-labelling further down assigns
# into valid_income_df, which on a plain boolean slice triggers pandas'
# SettingWithCopyWarning / silent chained assignment.
valid_income_df = ok_cupid_df[ok_cupid_df['income'] != -1].copy()
male_df_income = valid_income_df[valid_income_df['sex'] == 'm']
female_df_income = valid_income_df[valid_income_df['sex'] == 'f']
ok_cupid_df2 = ok_cupid_df.copy()
# Collapse near-synonymous body types into broader buckets.
# A single mapping replaces the four sequential replace() calls; no key
# maps onto another key, so the result is identical.
ok_cupid_df2['body_type'] = ok_cupid_df2['body_type'].replace({
    'a little extra': 'curvy',
    'full figured': 'curvy',
    'thin': 'skinny',
    'fit': 'athletic',
    'unknown_body_type': 'rather not say',
})
# Add new binary column for gender (1 = male, 0 = female).
ok_cupid_df["male"] = np.where(ok_cupid_df["sex"]=="m", 1, 0)
ok_cupid_df.head(2)
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | ... | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 |
1 | 35 | single | m | straight | average | mostly other | often | sometimes | working on space camp | white | ... | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 |
2 rows × 24 columns
# Drop the sex column
ok_cupid_df.drop(columns="sex", inplace=True)
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 |
2 rows × 23 columns
ok_cupid_df.reset_index(drop=True, inplace=True)
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay', 'essay_len'],
dtype='object')
# Strip the "working on / graduated from / dropped out of" qualifiers so
# education can be compared on the base credential alone.  Building the
# 24-entry mapping programmatically covers exactly the same keys as the
# original hand-written dict.
_edu_prefixes = ('working on ', 'graduated from ', 'dropped out of ')
_edu_levels = ('space camp', 'college/university', 'high school',
               'law school', 'masters program', 'two-year college',
               'med school', 'ph.d program')
_edu_map = {prefix + level: level
            for level in _edu_levels
            for prefix in _edu_prefixes}
valid_income_df['education'] = valid_income_df['education'].replace(_edu_map)
male_df_income = valid_income_df[valid_income_df['sex'] == 'm']
female_df_income = valid_income_df[valid_income_df['sex'] == 'f']
# Per-education mean/median income for men.
# The labels zipped into the result frame must come from the SAME
# unique() sequence the loop iterates (valid_income_df); the original
# zipped male_df_income's unique() order, which can differ, pairing the
# statistics with the wrong education label.
educations_male = valid_income_df['education'].unique()
means_male = []
medians_male = []
for education in educations_male:
    male_incomes = male_df_income.loc[male_df_income['education'] == education, 'income']
    means_male.append(male_incomes.mean())
    medians_male.append(male_incomes.median())
list_of_tuples_male = list(zip(educations_male, means_male, medians_male))
df_mean_incomes_male = pd.DataFrame(list_of_tuples_male, columns=['education', 'mean_income', 'median_income'])
# Per-education mean/median income for women.
# Same alignment fix as for the male frame: labels must come from the
# unique() sequence the loop iterates, not female_df_income's own
# (potentially differently ordered) unique().
educations_female = valid_income_df['education'].unique()
means_female = []
medians_female = []
for education in educations_female:
    female_incomes = female_df_income.loc[female_df_income['education'] == education, 'income']
    means_female.append(female_incomes.mean())
    medians_female.append(female_incomes.median())
list_of_tuples_female = list(zip(educations_female, means_female, medians_female))
df_mean_incomes_female = pd.DataFrame(list_of_tuples_female, columns=['education', 'mean_income', 'median_income'])
df_mean_incomes_male_mean = df_mean_incomes_male.sort_values(by = 'mean_income')
df_mean_incomes_male_median = df_mean_incomes_male.sort_values(by = 'median_income')
df_mean_incomes_female_mean = df_mean_incomes_female.sort_values(by = 'mean_income')
df_mean_incomes_female_median = df_mean_incomes_female.sort_values(by = 'median_income')
#df_mean_incomes_male_mean
educations = valid_income_df['education'].unique()
# Horizontal grouped bars: median income per education level, by sex.
# Each trace pairs its OWN education labels with its own median values;
# the original used the unsorted `educations` array as y for x-values
# sorted by median, mislabeling every bar.
fig = go.Figure(data=[
    go.Bar(name='Male',
           y=df_mean_incomes_male_median['education'],
           x=df_mean_incomes_male_median['median_income'],
           text=df_mean_incomes_male_median['median_income'], orientation='h'),
    go.Bar(name='Female',
           y=df_mean_incomes_female_median['education'],
           x=df_mean_incomes_female_median['median_income'],
           text=df_mean_incomes_female_median['median_income'], orientation='h')
])
fig.update_layout(barmode='group',
                  title='Median Income for each education',
                  yaxis=dict(title='Educations',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Median Income',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
fig = go.Figure()
fig.add_trace(go.Histogram(x=male_df_income['income'], name='Male'))
fig.add_trace(go.Histogram(x=female_df_income['income'], name='Female'))
fig.update_layout(barmode='overlay',
title='Distribution of Income for Males and Females',
#xaxis_tickfont_size=14,
yaxis=dict(title='Count',
titlefont_size=16,
tickfont_size=14),
xaxis=dict(title='Income',
titlefont_size=16,
tickfont_size=14))
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()
labels = ['Male', 'Female']
values = ok_cupid_df2['sex'].value_counts().values
fig = go.Figure(data=[go.Pie(labels=labels,
values=values,
hole=.5)])
fig.update_layout(title='Male vs Female')
fig.show()
Create new dataframes for Male, Female, and Orientations:
male_df = ok_cupid_df2[ok_cupid_df2['sex'] == 'm']
female_df = ok_cupid_df2[ok_cupid_df2['sex'] == 'f']
# Collapse df2's religion strings into four buckets:
# agnostic / atheist / unknown_religion / religious.
# Map every variant straight to its FINAL label: the original replaced
# variants to the intermediate values 'agnosticism'/'atheism' in the same
# dict that mapped those intermediates onward — replace() does not chain,
# so the variants survived and the catch-all loop mislabelled them as
# 'religious'.  (Column was already filled, so every value is a string.)
def _bucket_religion_df2(value):
    if value.startswith('agnosticism'):
        return 'agnostic'
    if value.startswith('atheism'):
        return 'atheist'
    if value == 'unknown_religion':
        return 'unknown_religion'
    return 'religious'

ok_cupid_df2['religion'] = ok_cupid_df2['religion'].apply(_bucket_religion_df2)
# Derive religion_binary: agnostic / atheist / unknown_religion / religious.
# Map each variant directly to its final label.  The original two-step
# replace left the 'laughing about it' variants on the intermediate
# values 'agnosticism'/'atheism' (replace() does not chain within one
# call), so the catch-all loop mislabelled them as 'religious'.
def _bucket_religion(value):
    if value.startswith('agnosticism'):
        return 'agnostic'
    if value.startswith('atheism'):
        return 'atheist'
    if value == 'unknown_religion':
        return 'unknown_religion'
    return 'religious'

ok_cupid_df['religion_binary'] = ok_cupid_df['religion'].apply(_bucket_religion)
ok_cupid_df2['religion'].value_counts()
religious 34820
unknown_religion 20222
agnostic 2724
atheist 2175
Name: religion, dtype: int64
ok_cupid_df['religion_binary'].value_counts()
religious 28492
unknown_religion 20222
agnostic 6316
atheist 4911
Name: religion_binary, dtype: int64
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | male | religion_binary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 | agnostic |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | agnostic |
2 rows × 24 columns
# One 0/1 indicator column per collapsed religion bucket.
for bucket in ('religious', 'agnostic', 'atheist'):
    ok_cupid_df[f'is_{bucket}'] = np.where(
        ok_cupid_df['religion_binary'] == bucket, 1, 0)
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | sign | smokes | speaks | essay | essay_len | male | religion_binary | is_religious | is_agnostic | is_atheist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 | agnostic | 0 | 1 | 0 |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | agnostic | 0 | 1 | 0 |
2 rows × 27 columns
straight_df = ok_cupid_df[ok_cupid_df['orientation'] == 'straight']
non_straight_df = ok_cupid_df[ok_cupid_df['orientation'].isin(['bisexual', 'gay'])]
non_straight_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | sign | smokes | speaks | essay | essay_len | male | religion_binary | is_religious | is_agnostic | is_atheist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
37 | 25 | single | bisexual | fit | mostly anything | socially | unknown_drugs | working on college/university | hispanic / latin, white | 69.0 | ... | libra and it’s fun to think about | unknown_smokes | english (fluently), spanish (poorly) | lets go to a festival and dance all night runn... | 3516 | 1 | unknown_religion | 0 | 0 | 0 |
44 | 29 | single | bisexual | curvy | anything | socially | sometimes | graduated from masters program | white | 66.0 | ... | aquarius and it’s fun to think about | no | english, spanish (poorly), portuguese (poorly) | i am an east coast transplant looking for fun ... | 2259 | 0 | religious | 1 | 0 | 0 |
2 rows × 27 columns
# Complementary 0/1 indicators for sexual orientation.
straight_mask = ok_cupid_df['orientation'] == 'straight'
ok_cupid_df['is_straight'] = straight_mask.astype(int)
ok_cupid_df['is_not_straight'] = (~straight_mask).astype(int)
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | speaks | essay | essay_len | male | religion_binary | is_religious | is_agnostic | is_atheist | is_straight | is_not_straight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | english | about me i would love to think that i was som... | 2389 | 1 | agnostic | 0 | 1 | 0 | 1 | 0 |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | agnostic | 0 | 1 | 0 | 1 | 0 |
2 rows × 29 columns
# relgns =ok_cupid_df['religion_binary'].unique()
# fig = go.Figure([go.Bar(x=relgns, y=ok_cupid_df["religion_binary"].value_counts().values, color=ok_cupid_df['is_straight'])])
# fig.show()
# Grouped bars: collapsed religion buckets for straight vs non-straight.
# Count the collapsed religion_binary column and reindex to the label
# order: the original paired these four labels with value_counts() of
# the RAW religion column (dozens of categories, sorted descending),
# so the bars were mislabeled and truncated.
religious_views = ['religious', 'unknown_religion', 'agnostic', 'atheist']
straight_counts = straight_df['religion_binary'].value_counts().reindex(
    religious_views, fill_value=0)
non_straight_counts = non_straight_df['religion_binary'].value_counts().reindex(
    religious_views, fill_value=0)
fig = go.Figure(data=[
    go.Bar(name='Straight',
           x=religious_views,
           y=straight_counts.values,
           text=straight_counts.values),
    go.Bar(name='Non-straight',
           x=religious_views,
           y=non_straight_counts.values,
           text=non_straight_counts.values)
])
fig.update_layout(barmode='group',
                  title='Religions',
                  yaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Religions',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
# The religion comparison above is done; drop the helper columns so they do
# not leak into later encodings.
ok_cupid_df = ok_cupid_df.drop(columns=['is_not_straight', 'religion_binary'])
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay', 'essay_len'],
dtype='object')
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay', 'essay_len'],
dtype='object')
# Overlaid age histograms, one trace per sex.
fig = go.Figure()
for frame, label in ((male_df, 'Male'), (female_df, 'Female')):
    fig.add_trace(go.Histogram(x=frame['age'], name=label))
fig.update_layout(barmode='overlay',
                  title='Distribution of Age for Males and Females',
                  yaxis=dict(title='Count',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Age',
                             titlefont_size=16,
                             tickfont_size=14))
# Translucent bars keep both distributions visible where they overlap.
fig.update_traces(opacity=0.75)
fig.show()
# Overlaid height histograms, one trace per sex.
fig = go.Figure()
for frame, label in ((male_df, 'Male'), (female_df, 'Female')):
    fig.add_trace(go.Histogram(x=frame['height'], name=label))
fig.update_layout(barmode='overlay',
                  title='Distribution of Heights for Males and Females',
                  yaxis=dict(title='Count',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Height',
                             titlefont_size=16,
                             tickfont_size=14))
# Translucent bars keep both distributions visible where they overlap.
fig.update_traces(opacity=0.75)
fig.show()
orientations = ['straight', 'bisexual', 'gay']
# value_counts() sorts by frequency, which may differ between the sexes and
# from the hard-coded label order; reindex so bars match their labels.
male_counts = male_df['orientation'].value_counts().reindex(orientations).fillna(0).values
female_counts = female_df['orientation'].value_counts().reindex(orientations).fillna(0).values
fig = go.Figure(data=[
    go.Bar(name='Male',
           x=orientations,
           y=male_counts,
           text=male_counts),
    go.Bar(name='Female',
           x=orientations,
           y=female_counts,
           text=female_counts)
])
fig.update_layout(barmode='group',
                  title='Orientations',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Orientation',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
# Body-type counts per sex.  The original hard-coded a label list that
# repeated 'average' and omitted several categories (e.g. 'athletic'), so
# bars were mislabeled; derive the categories from the data and align the
# counts explicitly instead.
male_counts = male_df['body_type'].value_counts()
female_counts = female_df['body_type'].value_counts()
body_types = male_counts.index.union(female_counts.index).tolist()
male_aligned = male_counts.reindex(body_types).fillna(0).values
female_aligned = female_counts.reindex(body_types).fillna(0).values
fig = go.Figure(data=[
    go.Bar(name='Male',
           y=body_types,
           x=male_aligned,
           text=male_aligned, orientation='h'),
    go.Bar(name='Female',
           y=body_types,
           x=female_aligned,
           text=female_aligned, orientation='h')
])
fig.update_layout(barmode='group',
                  title='Body Types for Male and Female',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Body Types',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
statuses = ['single', 'available', 'seeing someone', 'married', 'unknown_status']
# value_counts() orders by frequency, which need not match `statuses`;
# reindex so each bar matches its label (missing categories count as 0).
male_counts = male_df['status'].value_counts().reindex(statuses).fillna(0).values
female_counts = female_df['status'].value_counts().reindex(statuses).fillna(0).values
fig = go.Figure(data=[
    go.Bar(name='Male',
           x=statuses,
           y=male_counts,
           text=male_counts),
    go.Bar(name='Female',
           x=statuses,
           y=female_counts,
           text=female_counts)
])
fig.update_layout(barmode='group',
                  title='Status',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Status',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
Encoding:
Orientation, status, job
from sklearn.preprocessing import OneHotEncoder

# One encoder per feature so each keeps its own fitted category list.
orientation_ohe = OneHotEncoder()
status_ohe = OneHotEncoder()
job_ohe = OneHotEncoder()

# Fit and transform each single-column frame into a sparse indicator matrix.
orientation = ok_cupid_df[['orientation']]
orientation_encoded = orientation_ohe.fit_transform(orientation)
display(orientation_encoded)

status = ok_cupid_df[['status']]
status_encoded = status_ohe.fit_transform(status)
display(status_encoded)

job = ok_cupid_df[['job']]
job_encoded = job_ohe.fit_transform(job)
display(job_encoded)
<59941x3 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x5 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x22 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Densify each sparse matrix into a labelled 0/1 frame, then drop one
# category per feature to avoid perfectly collinear (redundant) columns.
encoded_df_orientation = pd.DataFrame(orientation_encoded.toarray().astype(int),
                                      columns=orientation_ohe.categories_[0], dtype=int)
encoded_df_orientation = encoded_df_orientation.iloc[:, 1:]  # drop the first category
display(encoded_df_orientation.head(2))

# Status: the explicit unknown bucket serves as the dropped baseline.
encoded_df_status = pd.DataFrame(status_encoded.toarray().astype(int),
                                 columns=status_ohe.categories_[0], dtype=int)
encoded_df_status = encoded_df_status.drop(columns='unknown_status')
display(encoded_df_status.head(2))

# Jobs: likewise drop the unknown bucket.
encoded_df_job = pd.DataFrame(job_encoded.toarray().astype(int),
                              columns=job_ohe.categories_[0], dtype=int)
encoded_df_job = encoded_df_job.drop(columns='unknown_job')
display(encoded_df_job.head(2))
gay | straight | |
---|---|---|
0 | 0 | 1 |
1 | 0 | 1 |
available | married | seeing someone | single | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 1 |
artistic / musical / writer | banking / financial / real estate | clerical / administrative | computer / hardware / software | construction / craftsmanship | education / academia | entertainment / media | executive / management | hospitality / travel | law / legal services | ... | military | other | political / government | rather not say | retired | sales / marketing / biz dev | science / tech / engineering | student | transportation | unemployed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 21 columns
# Rename the job 'other' category so it cannot collide with other 'other'
# columns added later.
encoded_df_job = encoded_df_job.rename(columns={'other': 'otherjob'})
# Attach the orientation indicators and retire the raw column.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_orientation], axis=1).drop(columns='orientation')
ok_cupid_df.head(2)
age | status | body_type | diet | drinks | drugs | education | ethnicity | height | income | ... | speaks | essay | essay_len | male | is_religious | is_agnostic | is_atheist | is_straight | gay | straight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | ... | english | about me i would love to think that i was som... | 2389 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
1 | 35 | single | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | ... | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
2 rows × 28 columns
# Attach the status indicators and retire the raw column.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_status], axis=1).drop(columns='status')
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | job | ... | is_religious | is_agnostic | is_atheist | is_straight | gay | straight | available | married | seeing someone | single | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | transportation | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | hospitality / travel | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
2 rows × 31 columns
# Attach the job indicators and retire the raw column.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_job], axis=1).drop(columns='job')
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | last_online | ... | military | otherjob | political / government | rather not say | retired | sales / marketing / biz dev | science / tech / engineering | student | transportation | unemployed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | 2012-06-28-20-30 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | 2012-06-29-21-41 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 51 columns
Essays:
from sklearn.feature_extraction.text import CountVectorizer
# import the nltk stopwords
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Let's test it out
stemmer = nltk.stem.PorterStemmer()
# A set makes stopword membership O(1); the original list scanned O(n) per word.
ENGLISH_STOP_WORDS = set(stopwords.words('english'))
def my_tokenizer(sentence):
    """Tokenizer for CountVectorizer: split on single spaces, drop stopwords
    and empty tokens, and Porter-stem the surviving words.

    Assumes punctuation/case were already normalized upstream (the
    commented-out cleanup in the original hints at this — confirm).
    Returns a list of stemmed tokens.
    """
    listofstemmed_words = []
    for word in sentence.split(' '):
        # skip stopwords and empty strings produced by repeated spaces
        if word and word not in ENGLISH_STOP_WORDS:
            listofstemmed_words.append(stemmer.stem(word))
    return listofstemmed_words
Requirement already satisfied: nltk in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (3.6.2)
Requirement already satisfied: regex in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (2021.4.4)
Requirement already satisfied: joblib in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (1.0.1)
Requirement already satisfied: click in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (7.1.2)
Requirement already satisfied: tqdm in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (4.60.0)
[nltk_data] Downloading package stopwords to
[nltk_data] /Users/puneetsran/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
# 1. Instantiate: keep the 100 most frequent stems that appear in at least
#    5 essays, tokenized/stemmed by my_tokenizer.
essay = CountVectorizer(min_df=5, max_features=100, tokenizer=my_tokenizer)
# 2.+3. Fit the vocabulary and transform the essays in one pass.
essay_transformed = essay.fit_transform(ok_cupid_df["essay"])
essay_transformed
<59941x100 sparse matrix of type '<class 'numpy.int64'>'
with 2153638 stored elements in Compressed Sparse Row format>
# Label the dense count matrix with the learned vocabulary (100 stems).
essay_df = pd.DataFrame(essay_transformed.toarray(), columns=essay.get_feature_names())
essay_df.head(2)
adventur | also | alway | anyth | around | art | back | big | book | citi | ... | want | watch | way | well | work | world | would | write | year | your | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 3 | 0 | ... | 4 | 0 | 3 | 0 | 3 | 0 | 2 | 0 | 1 | 0 |
1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 100 columns
# Join back to ok_cupid_df
# NOTE(review): pd.concat aligns on index, and essay_df has a fresh
# RangeIndex — verify ok_cupid_df's index was reset after any earlier row
# drops (the frame shows 59941 of the original 59946 rows) or this join
# could misalign rows.
ok_cupid_df = pd.concat([ok_cupid_df, essay_df], axis=1)
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | last_online | ... | want | watch | way | well | work | world | would | write | year | your | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | 2012-06-28-20-30 | ... | 4 | 0 | 3 | 0 | 3 | 0 | 2 | 0 | 1 | 0 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | 2012-06-29-21-41 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 151 columns
# Vectorize the ethnicity strings; rows listing several ethnicities
# (e.g. "asian, white") get a count in each matching token column.
ethnicity = CountVectorizer()
ethnicity_transformed = ethnicity.fit_transform(ok_cupid_df["ethnicity"])
ethnicity_transformed
<59941x14 sparse matrix of type '<class 'numpy.int64'>'
with 78195 stored elements in Compressed Sparse Row format>
ethnicity_transformed.toarray()
array([[0, 1, 0, ..., 0, 0, 1],
[0, 0, 0, ..., 0, 0, 1],
[0, 0, 0, ..., 0, 1, 0],
...,
[0, 1, 0, ..., 0, 0, 0],
[0, 1, 1, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 1]])
# Label the dense matrix with the learned vocabulary tokens.
ethnicity_df = pd.DataFrame(columns=ethnicity.get_feature_names(), data=ethnicity_transformed.toarray())
# Total number of profiles with unknown ethnicity (kept for the pie chart below).
unknown_ethnicity = ethnicity_df[ethnicity_df['unknown_ethnicity'] == 1]['unknown_ethnicity']
unknown_ethnicity = unknown_ethnicity.sum()
# Drop one column to prevent redundant information
ethnicity_df = ethnicity_df.drop(columns='unknown_ethnicity')
# CountVectorizer split two-word ethnicities ("native american", "middle
# eastern", "pacific islander", "hispanic / latin") into separate tokens:
# rename one half to the full name, then drop the leftover halves.
# The rename must happen before the drop.
ethnicity_df = ethnicity_df.rename(columns={'american': 'native_american', 'eastern': 'middle_eastern', 'islander': 'pacific_islander', 'hispanic': 'hispanic_latin'})
ethnicity_df = ethnicity_df.drop(columns=['native', 'middle', 'pacific', 'latin'])
ethnicity_df.head(2)
native_american | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other | white | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
# Avoid clashing with the essay-token column also named 'other'.
ethnicity_df = ethnicity_df.rename(columns={'other': 'other_ethnicity'})
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, ethnicity_df], axis=1)
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | last_online | ... | your | native_american | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other_ethnicity | white | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | 2012-06-28-20-30 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | 2012-06-29-21-41 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 160 columns
ethnicity_df['native_american'].value_counts()[1]
1265
# Per-ethnicity profile totals.  Each column is a 0/1 indicator, so summing
# the values where the flag equals 1 (as the original did, column by column)
# is just the number of rows carrying that flag.
def _flag_total(column):
    # Same filter-then-sum as the original, factored out of the repetition.
    return ethnicity_df[ethnicity_df[column] == 1][column].sum()

native_american = _flag_total('native_american')
asian = _flag_total('asian')
black = _flag_total('black')
middle_eastern = _flag_total('middle_eastern')
hispanic_latin = _flag_total('hispanic_latin')
indian = _flag_total('indian')
pacific_islander = _flag_total('pacific_islander')
other_ethnicity = _flag_total('other_ethnicity')
white = _flag_total('white')
# Initialise a summary table of ethnicity totals (includes the unknown
# bucket computed before that column was dropped).
sum_ethnicities = {
    'ethnicity': ['native american', 'asian', 'black',
                  'middle eastern', 'hispanic/latin',
                  'indian', 'pacific islander',
                  'other_ethnicity', 'white', 'unknown_ethnicity'],
    'sum': [native_american, asian, black, middle_eastern,
            hispanic_latin, indian, pacific_islander, other_ethnicity,
            white, unknown_ethnicity],
}
ethnicities_sum = pd.DataFrame(sum_ethnicities)
ethnicities_sum['sum'].unique()
array([ 1265, 8205, 3328, 950, 5356, 1449, 1473, 3566, 37882,
5677])
#fig = px.bar(ethnicities_sum, x='ethnicity', y='sum')
#fig.show()
# Use the columns directly: the original took .unique() on 'sum', which
# would silently drop a slice (and misalign labels with values) if two
# ethnicities ever had equal totals.
labels = ethnicities_sum['ethnicity'].values
values = ethnicities_sum['sum'].values
fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values,
                             hole=.5)])
fig.update_layout(title='Ethnicities')  # fixed title typo ('Ethinicities')
fig.show()
# The raw string column is now fully encoded; drop it.
ok_cupid_df.drop(columns='ethnicity', inplace=True)
Body type
ok_cupid_df['body_type'].value_counts()
average 14652
fit 12711
athletic 11818
unknown_body_type 5292
thin 4711
curvy 3924
a little extra 2629
skinny 1777
full figured 1009
overweight 444
jacked 421
used up 355
rather not say 198
Name: body_type, dtype: int64
# Collapse overlapping body-type labels into broader buckets in one pass.
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace({
    'a little extra': 'curvy',
    'full figured': 'curvy',
    'thin': 'skinny',
    'fit': 'athletic',
    'unknown_body_type': 'rather not say',
})
ok_cupid_df['body_type'].value_counts()
athletic 24529
average 14652
curvy 7562
skinny 6488
rather not say 5490
overweight 444
jacked 421
used up 355
Name: body_type, dtype: int64
# Ordinal encoding of body_type.
body_type_mapping = {'overweight': 0, 'curvy': 1, 'average': 2, 'used up': 3,
                     'rather not say': 4, 'skinny': 5, 'athletic': 6, 'jacked': 7}
body_type_mapped_data = ok_cupid_df['body_type'].map(body_type_mapping)
# Replace the string column with its encoded counterpart.
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns='body_type'), body_type_mapped_data], axis=1)
ok_cupid_df.head(2)
age | diet | drinks | drugs | education | height | income | last_online | location | offspring | ... | native_american | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other_ethnicity | white | body_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | strictly anything | socially | never | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 35 | mostly other | often | sometimes | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 |
2 rows × 159 columns
Drinks
ok_cupid_df['drinks'].value_counts()
socially 41780
rarely 5957
often 5164
not at all 3267
unknown_drinks 2980
very often 471
desperately 322
Name: drinks, dtype: int64
# Ordinal encoding of drinking frequency (0 = not at all ... 6 = desperately).
drinks_mapping = {'desperately': 6, 'very often': 5, 'often': 4,
                  'unknown_drinks': 3, 'socially': 2, 'rarely': 1, 'not at all': 0}
drinks_mapped_data = ok_cupid_df['drinks'].map(drinks_mapping)
# Replace the string column with its encoded counterpart.
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns=['drinks']), drinks_mapped_data], axis=1)
ok_cupid_df.head(2)
age | diet | drugs | education | height | income | last_online | location | offspring | pets | ... | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other_ethnicity | white | body_type | drinks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | strictly anything | never | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
1 | 35 | mostly other | sometimes | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 |
2 rows × 159 columns
Diet
ok_cupid_df['diet'].value_counts()
unknowndiet 24389
mostly anything 16585
anything 6183
strictly anything 5114
mostly vegetarian 3444
mostly other 1006
strictly vegetarian 875
vegetarian 667
strictly other 452
mostly vegan 338
other 331
strictly vegan 228
vegan 137
mostly kosher 86
mostly halal 48
strictly halal 18
strictly kosher 18
halal 11
kosher 11
Name: diet, dtype: int64
# Earlier approach (merge bare labels into the 'strictly' tier) kept for reference:
#ok_cupid_df["diet"] = ok_cupid_df["diet"].replace({'anything': 'strictly anything', 'vegetarian': 'strictly vegetarian', 'other': 'strictly other',
#                                                    'vegan': 'strictly vegan', 'kosher': 'strictly kosher', 'halal': 'strictly halal'})
# Remove internal spaces so each diet value becomes a single CountVectorizer
# token (e.g. "mostly anything" -> "mostlyanything").
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()
unknowndiet 24389
mostlyanything 16585
anything 6183
strictlyanything 5114
mostlyvegetarian 3444
mostlyother 1006
strictlyvegetarian 875
vegetarian 667
strictlyother 452
mostlyvegan 338
other 331
strictlyvegan 228
vegan 137
mostlykosher 86
mostlyhalal 48
strictlykosher 18
strictlyhalal 18
kosher 11
halal 11
Name: diet, dtype: int64
# Substring replace turns 'other' -> 'otherdiet' (and likewise
# 'mostlyother'/'strictlyother') so the category cannot clash with the other
# 'other*' columns created elsewhere.
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace('other', 'otherdiet')
#ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()
unknowndiet 24389
mostlyanything 16585
anything 6183
strictlyanything 5114
mostlyvegetarian 3444
mostlyotherdiet 1006
strictlyvegetarian 875
vegetarian 667
strictlyotherdiet 452
mostlyvegan 338
otherdiet 331
strictlyvegan 228
vegan 137
mostlykosher 86
mostlyhalal 48
strictlykosher 18
strictlyhalal 18
kosher 11
halal 11
Name: diet, dtype: int64
# Vectorize the collapsed diet labels; each row holds exactly one token, so
# the result is effectively a one-hot matrix (59941 stored elements for
# 59941 rows).
diet = CountVectorizer()
diet_transformed = diet.fit_transform(ok_cupid_df["diet"])
diet_transformed
<59941x19 sparse matrix of type '<class 'numpy.int64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Label the dense matrix with the learned diet vocabulary.
diet_df = pd.DataFrame(diet_transformed.toarray(), columns=diet.get_feature_names())
# Drop the unknown bucket so it serves as the implicit baseline.
diet_df = diet_df.drop(columns=['unknowndiet'])
diet_df.head(2)
anything | halal | kosher | mostlyanything | mostlyhalal | mostlykosher | mostlyotherdiet | mostlyvegan | mostlyvegetarian | otherdiet | strictlyanything | strictlyhalal | strictlykosher | strictlyotherdiet | strictlyvegan | strictlyvegetarian | vegan | vegetarian | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Reduce the vocabulary to the base diets by filtering out the 'mostly*' and
# 'strictly*' variants.  The original removed items from the list while
# iterating over it, which skips the element following each removal and can
# leave stray variants behind; a comprehension has no such hazard.
diff_diets = [d for d in diet_df.columns if not d.startswith(('mostly', 'strictly'))]
diets = diff_diets
# Variant names derived from each base diet, used for the ordinal fold below.
diets_mostly = ['mostly' + diet for diet in diets]
diets_strictly = ['strictly' + diet for diet in diets]
# Fold the three intensity levels into one ordinal column per base diet:
# 0 = not mentioned, 1 = "mostly X", 2 = plain "X", 3 = "strictly X".
# Plain mentions first: mark them 2 in the base column.
for diet in diets:
    if diet in diet_df.columns:
        diet_df.loc[diet_df[diet] == 1, diet] = 2
# "mostly X" rows get a 1 in the base column; the variant column is dropped.
for diet, diet_mostly in zip(diets, diets_mostly):
    if diet_mostly in diet_df.columns:
        diet_df.loc[diet_df[diet_mostly] == 1, diet] = 1
        diet_df = diet_df.drop(columns=diet_mostly)
# "strictly X" rows get a 3; that variant column is dropped too.
for diet, diet_strictly in zip(diets, diets_strictly):
    if diet_strictly in diet_df.columns:
        diet_df.loc[diet_df[diet_strictly] == 1, diet] = 3
        diet_df = diet_df.drop(columns=diet_strictly)
diet_df.head(2)
anything | halal | kosher | otherdiet | vegan | vegetarian | |
---|---|---|---|---|---|---|
0 | 3 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 |
# NOTE(review): diet_df has no 'diet' column at this point (its columns are
# the diet categories), so this rename is a no-op — the 'other' category was
# already renamed via the 'otherdiet' string replacement upstream; confirm
# and consider removing.
diet_df.rename(columns = {'diet':'otherdiet'}, inplace = True)
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, diet_df], axis=1)
ok_cupid_df.drop(columns='diet', inplace=True)
ok_cupid_df.head(2)
age | drugs | education | height | income | last_online | location | offspring | pets | religion | ... | other_ethnicity | white | body_type | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | never | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | ... | 0 | 1 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | sometimes | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | ... | 0 | 1 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 164 columns
Drugs
ok_cupid_df['drugs'].value_counts()
never 37722
unknown_drugs 14077
sometimes 7732
often 410
Name: drugs, dtype: int64
# Ordinal encoding of drug use (0 = never ... 3 = often).
drugs_mapping = {'often': 3, 'sometimes': 2, 'unknown_drugs': 1, 'never': 0}
# Mapping covers every observed label, so the int cast is safe (no NaNs).
drugs_mapped_data = ok_cupid_df['drugs'].map(drugs_mapping).astype(int)
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns=['drugs']), drugs_mapped_data], axis=1)
ok_cupid_df.head(2)
age | education | height | income | last_online | location | offspring | pets | religion | sign | ... | white | body_type | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | drugs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | ... | 1 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | ... | 1 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
2 rows × 164 columns
Education:
ok_cupid_df['education'].value_counts()
graduated from college/university 23959
graduated from masters program 8961
unknown_education 6624
working on college/university 5712
working on masters program 1682
graduated from two-year college 1531
graduated from high school 1428
graduated from ph.d program 1272
graduated from law school 1122
working on two-year college 1074
dropped out of college/university 995
working on ph.d program 983
college/university 801
graduated from space camp 657
dropped out of space camp 523
graduated from med school 446
working on space camp 445
working on law school 269
two-year college 222
working on med school 212
dropped out of two-year college 191
dropped out of masters program 140
masters program 136
dropped out of ph.d program 127
dropped out of high school 102
high school 96
working on high school 87
space camp 58
ph.d program 26
law school 19
dropped out of law school 18
dropped out of med school 12
med school 11
Name: education, dtype: int64
# Ordinal encoding of education.  Tiers, roughly: space camp (0-2) <
# high school (3-5) < two-year college (6-8) < college/university (9-11) <
# unknown (12) < masters (13-15) < professional/doctoral programs (16-18).
# NOTE(review): law school, ph.d and med school deliberately share codes
# 16-18, and a bare level name (e.g. 'masters program') is coded the same as
# having graduated from it — presumably intentional; confirm.
education_mapping = {'dropped out of space camp':0, 'working on space camp':1, 'graduated from space camp':2,
                     'space camp': 2,
                     'dropped out of high school':3, 'working on high school': 4, 'graduated from high school': 5,
                     'high school': 5,
                     'dropped out of two-year college': 6, 'working on two-year college': 7,
                     'two-year college': 8,
                     'graduated from two-year college': 8, 'dropped out of college/university': 9,
                     'college/university': 11,
                     'working on college/university': 10, 'graduated from college/university': 11, 'unknown_education': 12,
                     'dropped out of masters program': 13, 'working on masters program': 14,
                     'masters program': 15,
                     'graduated from masters program': 15, 'dropped out of law school': 16,
                     'working on law school': 17, 'graduated from law school': 18,
                     'law school': 18,
                     'dropped out of ph.d program': 16, 'working on ph.d program': 17,
                     'ph.d program': 18,
                     'graduated from ph.d program': 18, 'dropped out of med school': 16,
                     'working on med school': 17, 'graduated from med school': 18, 'med school': 18}
education_mapped_data = ok_cupid_df['education'].map(education_mapping)
# The mapping covers every observed label, so no NaNs and the cast is safe.
education_mapped_data = education_mapped_data.astype(int)
ok_cupid_df = ok_cupid_df.drop(columns=['education'])
ok_cupid_df = pd.concat([ok_cupid_df, education_mapped_data], axis=1)
ok_cupid_df.head(2)
age | height | income | last_online | location | offspring | pets | religion | sign | smokes | ... | body_type | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | drugs | education | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | ... | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 10 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | ... | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
2 rows × 164 columns
Smokes:
ok_cupid_df['smokes'].value_counts()
no 43895
unknown_smokes 5509
sometimes 3787
when drinking 3039
yes 2231
trying to quit 1480
Name: smokes, dtype: int64
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder # This is used for multiple columns
# Instantiate the label encoder
le = LabelEncoder()
# Fit and transform the smokes column.  Exploratory only: LabelEncoder
# assigns codes alphabetically, which is not a meaningful ordering here, so
# a manual mapping is applied in the next cell instead.
le.fit_transform(ok_cupid_df['smokes'])
array([1, 0, 0, ..., 0, 2, 1])
le.classes_
array(['no', 'sometimes', 'trying to quit', 'unknown_smokes',
'when drinking', 'yes'], dtype=object)
# Manual ordinal encoding of smoking (0 = no ... 4 = yes); 'sometimes' and
# 'when drinking' intentionally share a level.
smokes_mapping = {'yes': 4, 'sometimes': 3, 'when drinking': 3,
                  'unknown_smokes': 2, 'trying to quit': 1, 'no': 0}
smokes_mapped_data = ok_cupid_df['smokes'].map(smokes_mapping).astype(int)
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns=['smokes']), smokes_mapped_data], axis=1)
ok_cupid_df.head(2)
age | height | income | last_online | location | offspring | pets | religion | sign | speaks | ... | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | drugs | education | smokes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | english | ... | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 3 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 |
2 rows × 164 columns
Location:
ok_cupid_df['location'].str.split(',', expand=True).head(2)
0 | 1 | 2 | |
---|---|---|---|
0 | south san francisco | california | None |
1 | oakland | california | None |
ok_cupid_df['location'].str.split(',', expand=True)[1].unique()
array([' california', ' colorado', ' new york', ' oregon', ' arizona',
' hawaii', ' montana', ' wisconsin', ' virginia', ' spain',
' nevada', ' illinois', ' vietnam', ' ireland', ' louisiana',
' michigan', ' texas', ' united kingdom', ' massachusetts',
' north carolina', ' idaho', ' mississippi', ' new jersey',
' florida', ' minnesota', ' georgia', ' utah', ' washington',
' west virginia', ' connecticut', ' tennessee', ' rhode island',
' district of columbia', ' british columbia', ' missouri',
' germany', ' pennsylvania', ' netherlands', ' switzerland',
' ohio'], dtype=object)
# City is everything before the first comma in `location`.
ok_cupid_df['city'] = ok_cupid_df['location'].str.partition(',')[0]
ok_cupid_df_copy_city = ok_cupid_df.copy()
# Frame with one row per distinct city; coordinates are added below.
city_data = {'city': ok_cupid_df_copy_city['city'].unique()}
cities_df = pd.DataFrame(city_data)
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
# Accumulators filled in cities_df row order by the geocoding loop below.
longitude = []
latitude = []
def findGeocode(city, max_retries=3):
    """Geocode a city name with Nominatim.

    Returns the geopy location object, or None when the service keeps
    timing out. The original retried via unbounded recursion, which can
    blow the stack (and hammer the service) on a persistent timeout;
    a bounded retry loop is used instead.
    """
    geolocator = Nominatim(user_agent="your_app_name")
    for _ in range(max_retries):
        try:
            return geolocator.geocode(city)
        except GeocoderTimedOut:
            continue
    return None
# Geocode each distinct city exactly once. The original called
# findGeocode twice per city — doubling the network requests and risking
# two different responses for the same city.
for city_name in cities_df["city"]:
    loc = findGeocode(city_name)
    if loc is not None:
        latitude.append(loc.latitude)
        longitude.append(loc.longitude)
    else:
        # Unresolved city: keep row alignment with NaN coordinates.
        latitude.append(np.nan)
        longitude.append(np.nan)
cities_df["longitude"] = longitude
cities_df["latitude"] = latitude
cities_df.head(2)
city | longitude | latitude | |
---|---|---|---|
0 | south san francisco | -122.416866 | 37.653540 |
1 | oakland | -122.271356 | 37.804456 |
#cities_df.rename(columns = {'City':'city'}, inplace = True)
# Profile count per city — used to size the map markers below.
df_cities = ok_cupid_df_copy_city['city'].value_counts().rename_axis('city').reset_index(name='counts')
df_cities.head(2)
city | counts | |
---|---|---|
0 | san francisco | 31063 |
1 | oakland | 7214 |
# Attach coordinates; the inner join silently drops any city that failed to geocode.
df_cities = pd.merge(cities_df, df_cities, on='city', how='inner')
df_cities.head(2)
city | longitude | latitude | counts | |
---|---|---|---|---|
0 | south san francisco | -122.416866 | 37.653540 | 416 |
1 | oakland | -122.271356 | 37.804456 | 7214 |
# Bubble map: one marker per city, sized by profile count.
# NOTE(review): color_continuous_scale has no effect here — 'city' is
# categorical, so plotly uses a discrete color sequence instead.
fig = px.scatter_mapbox(df_cities, lat="latitude", lon="longitude",
                        size='counts',
                        color='city',
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        #size_max=15, zoom=10,
                        mapbox_style="carto-positron"
                        )
fig.show()
# The state/region is the second location component; strip the leading space.
ok_cupid_df['state'] = ok_cupid_df['location'].str.split(',', expand=True)[1]
ok_cupid_df['state'] = ok_cupid_df['state'].str.lstrip(' ')
ok_cupid_df['state'].unique()
array(['california', 'colorado', 'new york', 'oregon', 'arizona',
'hawaii', 'montana', 'wisconsin', 'virginia', 'spain', 'nevada',
'illinois', 'vietnam', 'ireland', 'louisiana', 'michigan', 'texas',
'united kingdom', 'massachusetts', 'north carolina', 'idaho',
'mississippi', 'new jersey', 'florida', 'minnesota', 'georgia',
'utah', 'washington', 'west virginia', 'connecticut', 'tennessee',
'rhode island', 'district of columbia', 'british columbia',
'missouri', 'germany', 'pennsylvania', 'netherlands',
'switzerland', 'ohio'], dtype=object)
# Default everyone to the US, then patch the handful of non-US rows case
# by case. Order matters: each state-level fix is immediately followed by
# the city-level country fix for the same profile(s).
ok_cupid_df['country'] = 'united states'
# Spain (madrid)
ok_cupid_df.loc[ok_cupid_df['state'] == 'spain', 'state'] = 'madrid'
ok_cupid_df.loc[ok_cupid_df['city'] == 'madrid', 'country'] = 'spain'
# Vietnam (nha trang)
ok_cupid_df.loc[ok_cupid_df['state'] == 'vietnam', 'state'] = 'khanh hoa'
ok_cupid_df.loc[ok_cupid_df['city'] == 'nha trang', 'country'] = 'vietnam'
# Ireland (cork)
ok_cupid_df.loc[ok_cupid_df['state'] == 'ireland', 'state'] = 'munster'
ok_cupid_df.loc[ok_cupid_df['city'] == 'cork', 'country'] = 'ireland'
# UK rows had state 'united kingdom'; split by city into scotland/england.
ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'state'] = 'scotland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'country'] = 'united kingdom'
ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'state'] = 'england'
ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'country'] = 'united kingdom'
# Germany (kassel)
ok_cupid_df.loc[ok_cupid_df['state'] == 'germany', 'state'] = 'hessen'
ok_cupid_df.loc[ok_cupid_df['city'] == 'kassel', 'country'] = 'germany'
# Netherlands (amsterdam)
ok_cupid_df.loc[ok_cupid_df['state'] == 'netherlands', 'state'] = 'north holland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'amsterdam', 'country'] = 'netherlands'
# Switzerland (bonaduz)
ok_cupid_df.loc[ok_cupid_df['state'] == 'switzerland', 'state'] = 'graubunden'
ok_cupid_df.loc[ok_cupid_df['city'] == 'bonaduz', 'country'] = 'switzerland'
# Canada: state was already 'british columbia', only the country changes.
ok_cupid_df.loc[ok_cupid_df['city'] == 'vancouver', 'country'] = 'canada'
# city/state/country now carry all the information from location.
ok_cupid_df = ok_cupid_df.drop(columns=['location'])
# Sanity check: the lone Canadian profile.
ok_cupid_df[ok_cupid_df['country'] == 'canada']
age | height | income | last_online | offspring | pets | religion | sign | speaks | essay | ... | kosher | otherdiet | vegan | vegetarian | drugs | education | smokes | city | state | country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42435 | 32 | 63.0 | 60000 | 2012-06-28-18-38 | doesn't have kids | unknown_pets | other | aquarius | english (fluently), chinese (poorly), french (... | im happiest when wearing sunglasses and flipfl... | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | vancouver | british columbia | canada |
1 rows × 166 columns
ok_cupid_df['state'].unique()
array(['california', 'colorado', 'new york', 'oregon', 'arizona',
'hawaii', 'montana', 'wisconsin', 'virginia', 'madrid', 'nevada',
'illinois', 'khanh hoa', 'munster', 'louisiana', 'michigan',
'texas', 'scotland', 'england', 'massachusetts', 'north carolina',
'idaho', 'mississippi', 'new jersey', 'florida', 'minnesota',
'georgia', 'utah', 'washington', 'west virginia', 'connecticut',
'tennessee', 'rhode island', 'district of columbia',
'british columbia', 'missouri', 'hessen', 'pennsylvania',
'north holland', 'graubunden', 'ohio'], dtype=object)
ok_cupid_df['country'].unique()
array(['united states', 'spain', 'vietnam', 'ireland', 'united kingdom',
'canada', 'germany', 'netherlands', 'switzerland'], dtype=object)
# One-hot encode the three geography columns. The fitted encoders are
# kept so their .categories_ can name the columns below.
city_ohe = OneHotEncoder()
state_ohe = OneHotEncoder()
country_ohe = OneHotEncoder()
# OneHotEncoder expects 2D input, hence the single-column DataFrames.
city = pd.DataFrame(ok_cupid_df['city'])
state = pd.DataFrame(ok_cupid_df['state'])
country = pd.DataFrame(ok_cupid_df['country'])
city_encoded = city_ohe.fit_transform(city)
state_encoded = state_ohe.fit_transform(state)
country_encoded = country_ohe.fit_transform(country)
display(city_encoded)
display(state_encoded)
display(country_encoded)
<59941x197 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x41 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x9 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Put into a dataframe to get column names
# City: dense 0/1 indicators; drop the first (alphabetical) category to
# avoid perfectly collinear dummy columns.
encoded_df_city = pd.DataFrame(city_encoded.toarray().astype(int), columns=city_ohe.categories_[0], dtype=int)
encoded_df_city = encoded_df_city.drop(encoded_df_city.columns[0], axis=1)
display(encoded_df_city.head(2))
# State
encoded_df_state = pd.DataFrame(state_encoded.toarray().astype(int), columns=state_ohe.categories_[0], dtype=int)
encoded_df_state = encoded_df_state.drop(encoded_df_state.columns[0], axis=1)
display(encoded_df_state.head(2))
# Country
encoded_df_country = pd.DataFrame(country_encoded.toarray().astype(int), columns=country_ohe.categories_[0], dtype=int)
encoded_df_country = encoded_df_country.drop(encoded_df_country.columns[0], axis=1)
display(encoded_df_country.head(2))
albany | amsterdam | arcadia | asheville | ashland | astoria | atherton | atlanta | austin | bayshore | ... | vallejo | vancouver | walnut creek | washington | waterford | west oakland | westlake | woodacre | woodbridge | woodside | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 196 columns
british columbia | california | colorado | connecticut | district of columbia | england | florida | georgia | graubunden | hawaii | ... | pennsylvania | rhode island | scotland | tennessee | texas | utah | virginia | washington | west virginia | wisconsin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 40 columns
germany | ireland | netherlands | spain | switzerland | united kingdom | united states | vietnam | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# Join back to ok_cupid_df
# NOTE(review): concat aligns on index — this assumes ok_cupid_df still
# has the default 0..n-1 RangeIndex matching the encoded frames; confirm
# it was reset after any upstream row drops.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_city], axis=1)
ok_cupid_df.drop(columns='city', inplace=True)
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_state], axis=1)
ok_cupid_df.drop(columns='state', inplace=True)
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_country], axis=1)
ok_cupid_df.drop(columns='country', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | offspring | pets | religion | sign | speaks | essay | ... | west virginia | wisconsin | germany | ireland | netherlands | spain | switzerland | united kingdom | united states | vietnam | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | english | about me i would love to think that i was som... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 rows × 407 columns
Offspring:
ok_cupid_df['offspring'].value_counts()
unknown_offspring 35558
doesn't have kids 7559
doesn't have kids, but might want them 3875
doesn't have kids, but wants them 3565
doesn't want kids 2927
has kids 1883
has a kid 1881
doesn't have kids, and doesn't want any 1132
has kids, but doesn't want more 442
has a kid, but doesn't want more 275
has a kid, and might want more 231
wants kids 225
might want kids 181
has kids, and might want more 115
has a kid, and wants more 71
has kids, and wants more 21
Name: offspring, dtype: int64
# Rewrite each free-text offspring answer as space-separated tokens so
# CountVectorizer below can split it into independent indicator features.
offspring_token_map = {
    "doesn't have kids": "doesnt_have_kids",
    "doesn't have kids, but might want them": "doesnt_have_kids might_want_kids",
    "doesn't have kids, but wants them": "doesnt_have_kids wants_kids",
    "doesn't want kids": "doesnt_want_kids",
    "has kids": "has_kids",
    "has a kid": "has_a_kid",
    "doesn't have kids, and doesn't want any": "doesnt_have_kids doesnt_want_kids",
    "has kids, but doesn't want more": "has_kids doesnt_want_kids",
    "has a kid, but doesn't want more": "has_a_kid doesnt_want_kids",
    "has a kid, and might want more": "has_a_kid might_want_kids",
    "wants kids": "wants_kids",
    "might want kids": "might_want_kids",
    "has kids, and might want more": "has_kids might_want_kids",
    "has a kid, and wants more": "has_a_kid wants_kids",
    "has kids, and wants more": "has_kids wants_kids",
}
ok_cupid_df['offspring'] = ok_cupid_df['offspring'].replace(offspring_token_map)
ok_cupid_df['offspring'].value_counts()
unknown_offspring 35558
doesnt_have_kids 7559
doesnt_have_kids might_want_kids 3875
doesnt_have_kids wants_kids 3565
doesnt_want_kids 2927
has_kids 1883
has_a_kid 1881
doesnt_have_kids doesnt_want_kids 1132
has_kids doesnt_want_kids 442
has_a_kid doesnt_want_kids 275
has_a_kid might_want_kids 231
wants_kids 225
might_want_kids 181
has_kids might_want_kids 115
has_a_kid wants_kids 71
has_kids wants_kids 21
Name: offspring, dtype: int64
# Vectorize the tokenized offspring answers: fit_transform learns the
# vocabulary and produces the sparse count matrix in one step
# (equivalent to the separate fit + transform calls).
offspring = CountVectorizer()
offspring_transformed = offspring.fit_transform(ok_cupid_df["offspring"])
offspring_transformed
<59941x7 sparse matrix of type '<class 'numpy.int64'>'
with 69668 stored elements in Compressed Sparse Row format>
# Expand the sparse matrix into named 0/1 columns.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# newer versions need get_feature_names_out().
offspring_df = pd.DataFrame(columns=offspring.get_feature_names(), data=offspring_transformed.toarray())
offspring_df_copy = offspring_df.copy()
# Drop one column to prevent redundant information
offspring_df = offspring_df.drop(columns='unknown_offspring')
offspring_df.head(2)
doesnt_have_kids | doesnt_want_kids | has_a_kid | has_kids | might_want_kids | wants_kids | |
---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 0 | 0 | 1 | 0 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, offspring_df], axis=1)
ok_cupid_df.drop(columns='offspring', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | pets | religion | sign | speaks | essay | essay_len | ... | switzerland | united kingdom | united states | vietnam | doesnt_have_kids | doesnt_want_kids | has_a_kid | has_kids | might_want_kids | wants_kids | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | likes dogs and likes cats | agnosticism and very serious about it | gemini | english | about me i would love to think that i was som... | 2389 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
2 rows × 412 columns
Pets:
ok_cupid_df['pets'].value_counts()
unknown_pets 19917
likes dogs and likes cats 14814
likes dogs 7224
likes dogs and has cats 4313
has dogs 4134
has dogs and likes cats 2333
likes dogs and dislikes cats 2029
has dogs and has cats 1474
has cats 1406
likes cats 1062
has dogs and dislikes cats 552
dislikes dogs and likes cats 240
dislikes dogs and dislikes cats 196
dislikes cats 122
dislikes dogs and has cats 81
dislikes dogs 44
Name: pets, dtype: int64
# Rewrite each pets answer as space-separated tokens so CountVectorizer
# below can split it into independent dog/cat indicator features.
pets_token_map = {
    'likes dogs and likes cats': 'likes_dogs likes_cats',
    'likes dogs': 'likes_dogs',
    'likes dogs and has cats': 'likes_dogs has_cats',
    'has dogs': 'has_dogs',
    'has dogs and likes cats': 'has_dogs likes_cats',
    'likes dogs and dislikes cats': 'likes_dogs dislikes_cats',
    'has dogs and has cats': 'has_dogs has_cats',
    'has cats': 'has_cats',
    'likes cats': 'likes_cats',
    'has dogs and dislikes cats': 'has_dogs dislikes_cats',
    'dislikes dogs and likes cats': 'dislikes_dogs likes_cats',
    'dislikes dogs and dislikes cats': 'dislikes_dogs dislikes_cats',
    'dislikes cats': 'dislikes_cats',
    'dislikes dogs and has cats': 'dislikes_dogs has_cats',
    'dislikes dogs': 'dislikes_dogs',
}
ok_cupid_df['pets'] = ok_cupid_df['pets'].replace(pets_token_map)
ok_cupid_df['pets'].value_counts()
unknown_pets 19917
likes_dogs likes_cats 14814
likes_dogs 7224
likes_dogs has_cats 4313
has_dogs 4134
has_dogs likes_cats 2333
likes_dogs dislikes_cats 2029
has_dogs has_cats 1474
has_cats 1406
likes_cats 1062
has_dogs dislikes_cats 552
dislikes_dogs likes_cats 240
dislikes_dogs dislikes_cats 196
dislikes_cats 122
dislikes_dogs has_cats 81
dislikes_dogs 44
Name: pets, dtype: int64
# Vectorize the tokenized pets answers; fit_transform is equivalent to
# the separate fit + transform calls.
pets = CountVectorizer()
pets_transformed = pets.fit_transform(ok_cupid_df["pets"])
pets_transformed
<59941x7 sparse matrix of type '<class 'numpy.int64'>'
with 85973 stored elements in Compressed Sparse Row format>
# Expand the sparse matrix into named 0/1 columns.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# newer versions need get_feature_names_out().
pets_df = pd.DataFrame(columns=pets.get_feature_names(), data=pets_transformed.toarray())
pets_df_copy = pets_df.copy()
# Drop one column to prevent redundant information
pets_df = pets_df.drop(columns='unknown_pets')
pets_df.head(2)
dislikes_cats | dislikes_dogs | has_cats | has_dogs | likes_cats | likes_dogs | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 0 | 0 | 0 | 0 | 1 | 1 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, pets_df], axis=1)
ok_cupid_df.drop(columns='pets', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | religion | sign | speaks | essay | essay_len | male | ... | has_a_kid | has_kids | might_want_kids | wants_kids | dislikes_cats | dislikes_dogs | has_cats | has_dogs | likes_cats | likes_dogs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | agnosticism and very serious about it | gemini | english | about me i would love to think that i was som... | 2389 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
2 rows × 417 columns
Religion:
ok_cupid_df['religion'].value_counts()
unknown_religion 20222
agnosticism 2724
other 2691
agnosticism but not too serious about it 2636
agnosticism and laughing about it 2496
catholicism but not too serious about it 2318
atheism 2175
other and laughing about it 2119
atheism and laughing about it 2074
christianity 1957
christianity but not too serious about it 1952
other but not too serious about it 1554
judaism but not too serious about it 1517
atheism but not too serious about it 1318
catholicism 1064
christianity and somewhat serious about it 927
atheism and somewhat serious about it 848
other and somewhat serious about it 845
catholicism and laughing about it 726
judaism and laughing about it 681
buddhism but not too serious about it 650
agnosticism and somewhat serious about it 642
judaism 612
christianity and very serious about it 578
atheism and very serious about it 570
catholicism and somewhat serious about it 548
other and very serious about it 533
buddhism and laughing about it 466
buddhism 403
christianity and laughing about it 373
buddhism and somewhat serious about it 359
agnosticism and very serious about it 314
judaism and somewhat serious about it 266
hinduism but not too serious about it 227
hinduism 107
catholicism and very serious about it 102
buddhism and very serious about it 70
hinduism and somewhat serious about it 58
islam 48
hinduism and laughing about it 44
islam but not too serious about it 40
islam and somewhat serious about it 22
judaism and very serious about it 22
islam and laughing about it 16
hinduism and very serious about it 14
islam and very serious about it 13
Name: religion, dtype: int64
# Collapse each answer to a single token (remove spaces) so CountVectorizer
# treats the whole religion+qualifier phrase as one feature; rename 'other'
# to avoid a column-name collision (cf. the 'otherdiet' column earlier).
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace(' ', '')
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace('other', 'otherreligion')
ok_cupid_df['religion'].value_counts()
unknown_religion 20222
agnosticism 2724
otherreligion 2691
agnosticismbutnottooseriousaboutit 2636
agnosticismandlaughingaboutit 2496
catholicismbutnottooseriousaboutit 2318
atheism 2175
otherreligionandlaughingaboutit 2119
atheismandlaughingaboutit 2074
christianity 1957
christianitybutnottooseriousaboutit 1952
otherreligionbutnottooseriousaboutit 1554
judaismbutnottooseriousaboutit 1517
atheismbutnottooseriousaboutit 1318
catholicism 1064
christianityandsomewhatseriousaboutit 927
atheismandsomewhatseriousaboutit 848
otherreligionandsomewhatseriousaboutit 845
catholicismandlaughingaboutit 726
judaismandlaughingaboutit 681
buddhismbutnottooseriousaboutit 650
agnosticismandsomewhatseriousaboutit 642
judaism 612
christianityandveryseriousaboutit 578
atheismandveryseriousaboutit 570
catholicismandsomewhatseriousaboutit 548
otherreligionandveryseriousaboutit 533
buddhismandlaughingaboutit 466
buddhism 403
christianityandlaughingaboutit 373
buddhismandsomewhatseriousaboutit 359
agnosticismandveryseriousaboutit 314
judaismandsomewhatseriousaboutit 266
hinduismbutnottooseriousaboutit 227
hinduism 107
catholicismandveryseriousaboutit 102
buddhismandveryseriousaboutit 70
hinduismandsomewhatseriousaboutit 58
islam 48
hinduismandlaughingaboutit 44
islambutnottooseriousaboutit 40
islamandsomewhatseriousaboutit 22
judaismandveryseriousaboutit 22
islamandlaughingaboutit 16
hinduismandveryseriousaboutit 14
islamandveryseriousaboutit 13
Name: religion, dtype: int64
# One token per row, so this yields one-hot religion+qualifier columns.
religion = CountVectorizer()
religion_transformed = religion.fit_transform(ok_cupid_df["religion"])
religion_transformed
<59941x46 sparse matrix of type '<class 'numpy.int64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Named 0/1 columns, one per religion+qualifier token.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2.
religion_df = pd.DataFrame(columns=religion.get_feature_names(), data=religion_transformed.toarray())
# Drop one column to prevent redundant information
religion_df = religion_df.drop(columns=['unknown_religion'])
religion_df.head(2)
agnosticism | agnosticismandlaughingaboutit | agnosticismandsomewhatseriousaboutit | agnosticismandveryseriousaboutit | agnosticismbutnottooseriousaboutit | atheism | atheismandlaughingaboutit | atheismandsomewhatseriousaboutit | atheismandveryseriousaboutit | atheismbutnottooseriousaboutit | ... | judaism | judaismandlaughingaboutit | judaismandsomewhatseriousaboutit | judaismandveryseriousaboutit | judaismbutnottooseriousaboutit | otherreligion | otherreligionandlaughingaboutit | otherreligionandsomewhatseriousaboutit | otherreligionandveryseriousaboutit | otherreligionbutnottooseriousaboutit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 45 columns
# Collapse the one-hot religion+qualifier columns into a single ordinal
# column per religion:
#   4 = stated with no qualifier, or "very serious about it"
#   3 = "somewhat serious about it"
#   2 = "but not too serious about it"
#   1 = "laughing about it"
#   0 = religion not stated
# The base religion names are the columns without a seriousness suffix.
# A comprehension replaces the original's four list.remove()-while-
# iterating loops, a fragile pattern that skips the element following
# each removal (it only worked here by accident of column ordering).
seriousness_scores = [('andveryseriousaboutit', 4),
                      ('andlaughingaboutit', 1),
                      ('andsomewhatseriousaboutit', 3),
                      ('butnottooseriousaboutit', 2)]
suffixes = tuple(s for s, _ in seriousness_scores)
religions = [c for c in religion_df.columns if not c.endswith(suffixes)]
# An unqualified religion counts as fully serious (4), matching the original.
for base in religions:
    religion_df.loc[religion_df[base] == 1, base] = 4
# Fold each qualified column's score into its base column, then drop it.
for suffix, score in seriousness_scores:
    for base in religions:
        qualified = base + suffix
        if qualified in religion_df.columns:
            religion_df.loc[religion_df[qualified] == 1, base] = score
            religion_df = religion_df.drop(columns=qualified)
religion_df.head(2)
agnosticism | atheism | buddhism | catholicism | christianity | hinduism | islam | judaism | otherreligion | |
---|---|---|---|---|---|---|---|---|---|
0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, religion_df], axis=1)
ok_cupid_df.drop(columns='religion', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | sign | speaks | essay | essay_len | male | is_religious | ... | likes_dogs | agnosticism | atheism | buddhism | catholicism | christianity | hinduism | islam | judaism | otherreligion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | gemini | english | about me i would love to think that i was som... | 2389 | 1 | 0 | ... | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | ... | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 425 columns
Sign:
# Strip the HTML entity '&rsquo;' (curly apostrophe) left in the sign text.
ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace('&', '').str.replace('rsquo;','')
ok_cupid_df['sign'].value_counts()
unknown_sign 11053
gemini and its fun to think about 1782
scorpio and its fun to think about 1772
leo and its fun to think about 1692
libra and its fun to think about 1649
taurus and its fun to think about 1640
cancer and its fun to think about 1597
pisces and its fun to think about 1592
sagittarius and its fun to think about 1583
virgo and its fun to think about 1574
aries and its fun to think about 1573
aquarius and its fun to think about 1503
virgo but it doesnt matter 1497
leo but it doesnt matter 1457
cancer but it doesnt matter 1454
gemini but it doesnt matter 1453
taurus but it doesnt matter 1450
libra but it doesnt matter 1408
aquarius but it doesnt matter 1407
capricorn and its fun to think about 1376
sagittarius but it doesnt matter 1375
aries but it doesnt matter 1373
capricorn but it doesnt matter 1319
pisces but it doesnt matter 1300
scorpio but it doesnt matter 1264
leo 1159
libra 1098
cancer 1092
virgo 1029
scorpio 1020
gemini 1013
taurus 1001
aries 995
pisces 992
aquarius 954
sagittarius 937
capricorn 833
scorpio and it matters a lot 78
leo and it matters a lot 66
cancer and it matters a lot 63
aquarius and it matters a lot 63
pisces and it matters a lot 62
gemini and it matters a lot 62
libra and it matters a lot 52
taurus and it matters a lot 49
aries and it matters a lot 47
sagittarius and it matters a lot 47
capricorn and it matters a lot 45
virgo and it matters a lot 41
Name: sign, dtype: int64
# Collapse each sign+qualifier phrase to a single token for CountVectorizer.
ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace(' ', '')
ok_cupid_df['sign'].value_counts()
unknown_sign 11053
geminianditsfuntothinkabout 1782
scorpioanditsfuntothinkabout 1772
leoanditsfuntothinkabout 1692
libraanditsfuntothinkabout 1649
taurusanditsfuntothinkabout 1640
canceranditsfuntothinkabout 1597
piscesanditsfuntothinkabout 1592
sagittariusanditsfuntothinkabout 1583
virgoanditsfuntothinkabout 1574
ariesanditsfuntothinkabout 1573
aquariusanditsfuntothinkabout 1503
virgobutitdoesntmatter 1497
leobutitdoesntmatter 1457
cancerbutitdoesntmatter 1454
geminibutitdoesntmatter 1453
taurusbutitdoesntmatter 1450
librabutitdoesntmatter 1408
aquariusbutitdoesntmatter 1407
capricornanditsfuntothinkabout 1376
sagittariusbutitdoesntmatter 1375
ariesbutitdoesntmatter 1373
capricornbutitdoesntmatter 1319
piscesbutitdoesntmatter 1300
scorpiobutitdoesntmatter 1264
leo 1159
libra 1098
cancer 1092
virgo 1029
scorpio 1020
gemini 1013
taurus 1001
aries 995
pisces 992
aquarius 954
sagittarius 937
capricorn 833
scorpioanditmattersalot 78
leoanditmattersalot 66
canceranditmattersalot 63
aquariusanditmattersalot 63
piscesanditmattersalot 62
geminianditmattersalot 62
libraanditmattersalot 52
taurusanditmattersalot 49
ariesanditmattersalot 47
sagittariusanditmattersalot 47
capricornanditmattersalot 45
virgoanditmattersalot 41
Name: sign, dtype: int64
# One token per row, so this yields one-hot sign+qualifier columns.
sign = CountVectorizer()
sign_transformed = sign.fit_transform(ok_cupid_df["sign"])
sign_transformed
<59941x49 sparse matrix of type '<class 'numpy.int64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Named 0/1 columns, one per sign+qualifier token.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2.
sign_df = pd.DataFrame(columns=sign.get_feature_names(), data=sign_transformed.toarray())
# Drop one column to prevent redundant information
sign_df = sign_df.drop(columns=['unknown_sign'])
sign_df.head(2)
aquarius | aquariusanditmattersalot | aquariusanditsfuntothinkabout | aquariusbutitdoesntmatter | aries | ariesanditmattersalot | ariesanditsfuntothinkabout | ariesbutitdoesntmatter | cancer | canceranditmattersalot | ... | scorpioanditsfuntothinkabout | scorpiobutitdoesntmatter | taurus | taurusanditmattersalot | taurusanditsfuntothinkabout | taurusbutitdoesntmatter | virgo | virgoanditmattersalot | virgoanditsfuntothinkabout | virgobutitdoesntmatter | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 48 columns
# Collapse the one-hot sign+qualifier columns into a single ordinal
# column per sign:
#   3 = "it matters a lot", or stated with no qualifier
#   2 = "but it doesnt matter"
#   1 = "its fun to think about"
#   0 = sign not stated
# A comprehension replaces the original's list.remove()-while-iterating
# loops, a fragile pattern that skips the element following each removal
# (it only worked here by accident of column ordering).
importance_scores = [('butitdoesntmatter', 2),
                     ('anditmattersalot', 3),
                     ('anditsfuntothinkabout', 1)]
suffixes = tuple(s for s, _ in importance_scores)
signs = [c for c in sign_df.columns if not c.endswith(suffixes)]
# An unqualified sign is scored 3, same as "matters a lot" (as before).
for base in signs:
    sign_df.loc[sign_df[base] == 1, base] = 3
# Fold each qualified column's score into its base column, then drop it.
for suffix, score in importance_scores:
    for base in signs:
        qualified = base + suffix
        if qualified in sign_df.columns:
            sign_df.loc[sign_df[qualified] == 1, base] = score
            sign_df = sign_df.drop(columns=qualified)
sign_df.head(2)
aquarius | aries | cancer | capricorn | gemini | leo | libra | pisces | sagittarius | scorpio | taurus | virgo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, sign_df], axis=1)
ok_cupid_df.drop(columns='sign', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | speaks | essay | essay_len | male | is_religious | is_agnostic | ... | cancer | capricorn | gemini | leo | libra | pisces | sagittarius | scorpio | taurus | virgo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | english | about me i would love to think that i was som... | 2389 | 1 | 0 | 1 | ... | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 1 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 436 columns
Speaks (languages):
# Normalize speaks: drop spaces, fold the "(fluently)"/"(okay)"/"(poorly)"
# qualifiers into suffixes on the language token, and use spaces as the
# language separator for CountVectorizer.
# NOTE(review): str.replace('(', ...) relies on literal matching; on pandas
# versions where regex=True is the default this raises on the unbalanced
# paren — confirm the pinned pandas version or pass regex=False.
ok_cupid_df['speaks'] = ok_cupid_df['speaks'].str.replace(' ', '').str.replace('(', '_').str.replace(')', '').str.replace(',', ' ').str.replace('_', '')
ok_cupid_df['speaks'].value_counts()
english 21826
englishfluently 6627
englishfluently spanishpoorly 2059
englishfluently spanishokay 1917
englishfluently spanishfluently 1288
...
englishfluently norwegianfluently swedishokay germanokay 1
english spanish portuguese thai 1
englishfluently frenchokay bulgarianpoorly chechenpoorly chinesepoorly 1
englishfluently chinesepoorly vietnamesepoorly japanesepoorly spanishokay 1
englishfluently chinesefluently japanesepoorly c++fluently otherfluently 1
Name: speaks, Length: 7648, dtype: int64
# Count-vectorize the space-separated language+proficiency tokens.
speaks = CountVectorizer()
speaks_transformed = speaks.fit_transform(ok_cupid_df["speaks"])
speaks_transformed
<59941x302 sparse matrix of type '<class 'numpy.int64'>'
with 110527 stored elements in Compressed Sparse Row format>
# Named 0/1 columns; drop the unknown marker, the bare qualifier tokens
# that appear without a language, and the joke 'lisp' entries.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2.
speaks_df = pd.DataFrame(columns=speaks.get_feature_names(), data=speaks_transformed.toarray())
# Drop one column to prevent redundant information
speaks_df = speaks_df.drop(columns=['unknownspeaks', 'poorly', 'fluently', 'okay', 'lisp', 'lispokay', 'lisppoorly', 'lispfluently'])
speaks_df.head(2)
afrikaans | afrikaansfluently | afrikaansokay | afrikaanspoorly | albanian | albanianfluently | albanianokay | albanianpoorly | ancientgreek | ancientgreekfluently | ... | vietnameseokay | vietnamesepoorly | welsh | welshfluently | welshokay | welshpoorly | yiddish | yiddishfluently | yiddishokay | yiddishpoorly | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 294 columns
# These three languages only occur with a 'fluently' qualifier, so the
# qualified column doubles as the base column.
speaks_df = speaks_df.rename(columns={'armenianfluently': 'armenian', 'slovenianfluently': 'slovenian', 'sardinianfluently': 'sardinian'})
# Collapse the language+proficiency one-hots into one ordinal column per
# language:
#   3 = fluently, or stated with no qualifier
#   2 = okay
#   1 = poorly
#   0 = language not listed
# A comprehension replaces the original's list.remove()-while-iterating
# loops, a fragile pattern that skips the element following each removal
# (it only worked here by accident of column ordering).
proficiency_scores = [('fluently', 3), ('okay', 2), ('poorly', 1)]
suffixes = tuple(s for s, _ in proficiency_scores)
languages = [c for c in speaks_df.columns if not c.endswith(suffixes)]
# An unqualified language counts as fluent (3), matching the original.
for base in languages:
    speaks_df.loc[speaks_df[base] == 1, base] = 3
# Fold each qualified column's score into its base column, then drop it.
for suffix, score in proficiency_scores:
    for base in languages:
        qualified = base + suffix
        if qualified in speaks_df.columns:
            speaks_df.loc[speaks_df[qualified] == 1, base] = score
            speaks_df = speaks_df.drop(columns=qualified)
speaks_df.head(2)
afrikaans | albanian | ancientgreek | arabic | armenian | basque | belarusan | bengali | breton | bulgarian | ... | tagalog | tamil | thai | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 75 columns
# Attach the per-language ordinal columns to the main frame, then drop the
# raw free-text 'speaks' column they were derived from.
ok_cupid_df = pd.concat([ok_cupid_df, speaks_df], axis=1).drop(columns='speaks')
ok_cupid_df.head(2)
age | height | income | last_online | essay | essay_len | male | is_religious | is_agnostic | is_atheist | ... | tagalog | tamil | thai | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | about me i would love to think that i was som... | 2389 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 510 columns
Drop income:
(mostly the -1 placeholder value, i.e. effectively missing)
ok_cupid_df.drop(columns='income', inplace=True)
Last online:
ok_cupid_df['last_online']
0 2012-06-28-20-30
1 2012-06-29-21-41
2 2012-06-27-09-10
3 2012-06-28-14-22
4 2012-06-27-21-26
...
59936 2012-06-12-21-47
59937 2012-06-29-11-01
59938 2012-06-27-23-37
59939 2012-06-23-13-01
59940 2012-06-29-00-42
Name: last_online, Length: 59941, dtype: object
# Derive year / month / weekday-flag features from the "YYYY-MM-DD-HH-MM"
# last_online strings. Split once and reuse the parts (the original re-split
# the column five times).
last_online_parts = ok_cupid_df['last_online'].str.split("-", expand=True)
ok_cupid_df['last_online_year'] = last_online_parts[0].astype('int')
ok_cupid_df['last_online_month'] = last_online_parts[1].astype('int')
# Parse the date explicitly: unit-less .astype('datetime64') is rejected by
# pandas >= 2.0, so use pd.to_datetime with a fixed format instead.
last_online_datetime = pd.to_datetime(
    last_online_parts[0] + '-' + last_online_parts[1] + '-' + last_online_parts[2],
    format='%Y-%m-%d',
)
# Binary flag: 1 = weekday (Mon-Fri, dt.weekday < 5), 0 = weekend.
ok_cupid_df['last_online_weekday'] = np.where(last_online_datetime.dt.weekday < 5, 1, 0)
ok_cupid_df['last_online_weekday']
0 1
1 1
2 1
3 1
4 1
..
59936 1
59937 1
59938 1
59939 0
59940 1
Name: last_online_weekday, Length: 59941, dtype: int64
#ok_cupid_df['last_online_year'] = ok_cupid_df['last_online'].str.split("-", expand=True)[0]
# Sanity-check: reassemble the "YYYY-MM-DD" date portion for display only;
# the result is not stored anywhere.
ok_cupid_df['last_online'].str.split("-", expand=True)[0]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[1]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[2]
0 2012-06-28
1 2012-06-29
2 2012-06-27
3 2012-06-28
4 2012-06-27
...
59936 2012-06-12
59937 2012-06-29
59938 2012-06-27
59939 2012-06-23
59940 2012-06-29
Length: 59941, dtype: object
# Drop the raw timestamp (now encoded as year/month/weekday features) and
# is_agnostic (presumably redundant with the other religion flags — confirm).
ok_cupid_df = ok_cupid_df.drop(columns=['last_online', 'is_agnostic'])
ok_cupid_df.head(2)
age | height | essay | essay_len | male | is_religious | is_atheist | is_straight | gay | straight | ... | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | last_online_year | last_online_month | last_online_weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | about me i would love to think that i was som... | 2389 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
1 | 35 | 70.0 | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
2 rows × 510 columns
# Remove the raw concatenated essay text; the numeric essay_len column remains.
ok_cupid_df.drop(columns='essay', inplace=True)
ok_cupid_df.head(2)
age | height | essay_len | male | is_religious | is_atheist | is_straight | gay | straight | available | ... | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | last_online_year | last_online_month | last_online_weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | 2389 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
1 | 35 | 70.0 | 1340 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
2 rows × 509 columns
ok_cupid_df.isna().sum().sum()
0
ok_cupid_df.to_csv (r'data/okcupid_profiles_clean.csv', index = False, header=True)