project
Testing
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
import plotly.graph_objects as go
ok_cupid_df = pd.read_csv('data/okcupid_profiles.csv')
ok_cupid_df.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
dtype='object')
ok_cupid_df.shape
(59946, 31)
ok_cupid_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 59946 non-null int64
1 status 59946 non-null object
2 sex 59946 non-null object
3 orientation 59946 non-null object
4 body_type 54650 non-null object
5 diet 35551 non-null object
6 drinks 56961 non-null object
7 drugs 45866 non-null object
8 education 53318 non-null object
9 ethnicity 54266 non-null object
10 height 59943 non-null float64
11 income 59946 non-null int64
12 job 51748 non-null object
13 last_online 59946 non-null object
14 location 59946 non-null object
15 offspring 24385 non-null object
16 pets 40025 non-null object
17 religion 39720 non-null object
18 sign 48890 non-null object
19 smokes 54434 non-null object
20 speaks 59896 non-null object
21 essay0 54458 non-null object
22 essay1 52374 non-null object
23 essay2 50308 non-null object
24 essay3 48470 non-null object
25 essay4 49409 non-null object
26 essay5 49096 non-null object
27 essay6 46175 non-null object
28 essay7 47495 non-null object
29 essay8 40721 non-null object
30 essay9 47343 non-null object
dtypes: float64(1), int64(2), object(28)
memory usage: 14.2+ MB
ok_cupid_df.head(2)
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | ... | about me: i would love to think that i was so... | currently working as an international agent fo... | making people laugh. ranting about a good salt... | the way i look. i am a six foot half asian, ha... | books: absurdistan, the republic, of mice and ... | food. water. cell phone. shelter. | duality and humorous things | trying to find someone to hang out with. i am ... | i am new to california and looking for someone... | you want to be swept off your feet! you are ti... |
1 | 35 | single | m | straight | average | mostly other | often | sometimes | working on space camp | white | ... | i am a chef: this is what that means. 1. i am ... | dedicating everyday to being an unbelievable b... | being silly. having ridiculous amonts of fun w... | NaN | i am die hard christopher moore fan. i don't r... | delicious porkness in all of its glories. my b... | NaN | NaN | i am very open and will share just about anyth... | NaN |
2 rows × 31 columns
Check null values:
ok_cupid_df.isna().sum()
age 0
status 0
sex 0
orientation 0
body_type 5296
diet 24395
drinks 2985
drugs 14080
education 6628
ethnicity 5680
height 3
income 0
job 8198
last_online 0
location 0
offspring 35561
pets 19921
religion 20226
sign 11056
smokes 5512
speaks 50
essay0 5488
essay1 7572
essay2 9638
essay3 11476
essay4 10537
essay5 10850
essay6 13771
essay7 12451
essay8 19225
essay9 12603
dtype: int64
ok_cupid_df.isna().sum()/ok_cupid_df.shape[0]*100
age 0.000000
status 0.000000
sex 0.000000
orientation 0.000000
body_type 8.834618
diet 40.694959
drinks 4.979482
drugs 23.487806
education 11.056618
ethnicity 9.475194
height 0.005005
income 0.000000
job 13.675641
last_online 0.000000
location 0.000000
offspring 59.321723
pets 33.231575
religion 33.740366
sign 18.443266
smokes 9.194942
speaks 0.083408
essay0 9.154906
essay1 12.631368
essay2 16.077803
essay3 19.143896
essay4 17.577486
essay5 18.099623
essay6 22.972342
essay7 20.770360
essay8 32.070530
essay9 21.023922
dtype: float64
Fill null values:
ok_cupid_df['drugs'] = ok_cupid_df['drugs'].fillna('unknown_drugs')
ok_cupid_df['drugs'].value_counts()
never 37724
unknown_drugs 14080
sometimes 7732
often 410
Name: drugs, dtype: int64
# Use 'unknown_diet' (underscored) for consistency with every other
# unknown_* sentinel introduced in this cleaning pass (unknown_drugs,
# unknown_job, ...); no later code depends on the old spelling.
ok_cupid_df['diet'] = ok_cupid_df['diet'].fillna('unknown_diet')
ok_cupid_df['diet'].value_counts()
unknowndiet 24395
mostly anything 16585
anything 6183
strictly anything 5113
mostly vegetarian 3444
mostly other 1007
strictly vegetarian 875
vegetarian 667
strictly other 452
mostly vegan 338
other 331
strictly vegan 228
vegan 136
mostly kosher 86
mostly halal 48
strictly halal 18
strictly kosher 18
halal 11
kosher 11
Name: diet, dtype: int64
#ok_cupid_df.loc[(ok_cupid_df['diet'] == 'unknown')&(ok_cupid_df['essay0'] == '57'), 'status'] = 'available'
ok_cupid_df.loc[ok_cupid_df['essay0'] == "im looking for someone to share some raging adhd. im a self motivated and light hearted superhero who enjoy's riding my bike everywhere and eating every goddamn thing i can. im looking for someone to go adventuring with. i enjoy blind drunken adventures sometimes but you dont have to be a drinker. no vegans, i will eat anything... including people... especially hipsters. im not really a nerd (i don't play magic cards/excessive videogames) but i can like nerdy girls. i just got this account, so gimmie some time to write down more shenanigans that are important if u make chiptunes hit me the fuck up! i wanna make some! i am awesome, eccentric, and energetic", 'diet'] = 'strictly anything'
ok_cupid_df.loc[ok_cupid_df['essay0'] == "rabid bibliophile, humorless feminist (that's a joke), eternal student. i like to write poetry on people, bake (vegan) cupcakes, make art and dress-up. i identify as queer but my choices here are limited so i chose bisexual. i am quiet, empathetic, and geeky", 'diet'] = 'vegan'
#ok_cupid_df[ok_cupid_df['diet'] == 'unknown_diet']['essay0'].tolist()
ok_cupid_df['status'].value_counts()
single 55697
seeing someone 2064
available 1865
married 310
unknown 10
Name: status, dtype: int64
# Reassign rather than replace(..., inplace=True) on a column selection:
# in-place mutation of a slice is chained assignment, deprecated and
# removed in pandas 3.x.
ok_cupid_df['status'] = ok_cupid_df['status'].replace({'unknown': 'unknown_status'})
ok_cupid_df['status'].value_counts()
single 55697
seeing someone 2064
available 1865
married 310
unknown_status 10
Name: status, dtype: int64
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].fillna('unknown_body_type')
ok_cupid_df['body_type'].value_counts()
average 14652
fit 12711
athletic 11819
unknown_body_type 5296
thin 4711
curvy 3924
a little extra 2629
skinny 1777
full figured 1009
overweight 444
jacked 421
used up 355
rather not say 198
Name: body_type, dtype: int64
ok_cupid_df['education'] = ok_cupid_df['education'].fillna('unknown_education')
ok_cupid_df['education'].value_counts()
graduated from college/university 23959
graduated from masters program 8961
unknown_education 6628
working on college/university 5712
working on masters program 1683
graduated from two-year college 1531
graduated from high school 1428
graduated from ph.d program 1272
graduated from law school 1122
working on two-year college 1074
dropped out of college/university 995
working on ph.d program 983
college/university 801
graduated from space camp 657
dropped out of space camp 523
graduated from med school 446
working on space camp 445
working on law school 269
two-year college 222
working on med school 212
dropped out of two-year college 191
dropped out of masters program 140
masters program 136
dropped out of ph.d program 127
dropped out of high school 102
high school 96
working on high school 87
space camp 58
ph.d program 26
law school 19
dropped out of law school 18
dropped out of med school 12
med school 11
Name: education, dtype: int64
# Fill the remaining categorical columns with a per-column
# 'unknown_<column>' sentinel so missing values become an explicit
# category instead of NaN.
for column in ('job', 'ethnicity', 'offspring', 'pets',
               'religion', 'sign', 'smokes'):
    ok_cupid_df[column] = ok_cupid_df[column].fillna('unknown_' + column)
ok_cupid_df['smokes'].value_counts()
no 43896
unknown_smokes 5512
sometimes 3787
when drinking 3040
yes 2231
trying to quit 1480
Name: smokes, dtype: int64
ok_cupid_df['drinks'] = ok_cupid_df['drinks'].fillna('unknown_drinks')
ok_cupid_df['drinks'].value_counts()
socially 41780
rarely 5957
often 5164
not at all 3267
unknown_drinks 2985
very often 471
desperately 322
Name: drinks, dtype: int64
ok_cupid_df['speaks'] = ok_cupid_df['speaks'].fillna('unknown_speaks')
ok_cupid_df['speaks'].value_counts()
english 21828
english (fluently) 6628
english (fluently), spanish (poorly) 2059
english (fluently), spanish (okay) 1917
english (fluently), spanish (fluently) 1288
...
english (fluently), french (poorly), polish (poorly), latin (poorly), italian (poorly) 1
english (fluently), hebrew (fluently), yiddish (fluently) 1
english (fluently), spanish (okay), catalan (poorly), italian (poorly) 1
english (fluently), c++ (fluently), bengali (okay), french (poorly) 1
english (fluently), ancient greek (okay), spanish (fluently), french (poorly), hebrew (poorly) 1
Name: speaks, Length: 7648, dtype: int64
ok_cupid_df[ok_cupid_df['height'].isna()]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36428 | 32 | single | f | straight | unknown_body_type | unknowndiet | unknown_drinks | unknown_drugs | unknown_education | other | ... | NaN | NaN | NaN | NaN | thomas bernhard, foucault, annie hall, taxi dr... | NaN | consciousness | NaN | i passionately hate liars! | you know what my user name means and if you ar... |
54002 | 25 | single | m | straight | unknown_body_type | unknowndiet | unknown_drinks | never | unknown_education | hispanic / latin | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
58983 | 49 | single | m | straight | unknown_body_type | unknowndiet | unknown_drinks | unknown_drugs | unknown_education | unknown_ethnicity | ... | great guy, lots of positive attributes*, but s... | living it. quite a bit more than that - more ... | lots, notably good, deep, excellent communicat... | some positive stuff, but i'll hold my tongue o... | lots. not especially up to listing 'em here a... | 1. damn good friend, or better 2. managing to ... | many things. maybe too much. not really up for... | at the moment, i'd rather not even say or thin... | i have a blog of much that's personal and priv... | you've good reason to think we'd like make at ... |
3 rows × 31 columns
ok_cupid_df.isna().sum()
age 0
status 0
sex 0
orientation 0
body_type 0
diet 0
drinks 0
drugs 0
education 0
ethnicity 0
height 3
income 0
job 0
last_online 0
location 0
offspring 0
pets 0
religion 0
sign 0
smokes 0
speaks 0
essay0 5488
essay1 7572
essay2 9638
essay3 11476
essay4 10537
essay5 10850
essay6 13771
essay7 12451
essay8 19225
essay9 12603
dtype: int64
Check for duplicated rows:
ok_cupid_df[ok_cupid_df.duplicated()]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 |
---|
0 rows × 31 columns
Transform essays:
# Essays: replace missing text with '' and merge the ten essay fields
# into one free-text column for the length/word-count features below.
essay_columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4',
                 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
ok_cupid_df[essay_columns] = ok_cupid_df[essay_columns].fillna('')
ok_cupid_df['essay'] = ok_cupid_df[essay_columns].agg(' '.join, axis=1)
# Strip punctuation, keeping only word characters and whitespace.
# Raw string + explicit regex=True are required: since pandas 2.0
# str.replace no longer treats the pattern as a regex by default, and
# '\w' in a plain string literal is an invalid escape sequence.
ok_cupid_df['essay'] = ok_cupid_df['essay'].str.replace(r'[^\w\s]', '', regex=True)
avg_words = pd.DataFrame()
avg_words['avg_words'] = ok_cupid_df['essay'].str.split().str.len()
avg_words.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
avg_words | 59946.0 | 353.896757 | 293.270595 | 0.0 | 158.0 | 296.0 | 477.0 | 10486.0 |
ok_cupid_df['essay_len'] = ok_cupid_df['essay'].str.len()
ok_cupid_df = ok_cupid_df.drop(columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9'])
ok_cupid_df.head(2)
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | ... | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 |
1 | 35 | single | m | straight | average | mostly other | often | sometimes | working on space camp | white | ... | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 |
2 rows × 23 columns
ok_cupid_df.isna().sum()
age 0
status 0
sex 0
orientation 0
body_type 0
diet 0
drinks 0
drugs 0
education 0
ethnicity 0
height 3
income 0
job 0
last_online 0
location 0
offspring 0
pets 0
religion 0
sign 0
smokes 0
speaks 0
essay 0
essay_len 0
dtype: int64
Drop 3 rows with null values:
ok_cupid_df.dropna(inplace=True)
ok_cupid_df[ok_cupid_df['age']>100]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2512 | 110 | single | f | straight | unknown_body_type | unknowndiet | unknown_drinks | unknown_drugs | unknown_education | unknown_ethnicity | ... | 2012-06-27-22-16 | daly city, california | unknown_offspring | unknown_pets | unknown_religion | unknown_sign | unknown_smokes | english | 9 | |
25324 | 109 | available | m | straight | athletic | mostly other | unknown_drinks | never | working on masters program | unknown_ethnicity | ... | 2012-06-30-18-18 | san francisco, california | might want kids | unknown_pets | other and somewhat serious about it | aquarius but it doesn’t matter | when drinking | english (okay) | nothing | 16 |
2 rows × 23 columns
#ok_cupid_df = ok_cupid_df.drop(ok_cupid_df.index[[, 25324]])
ok_cupid_df = ok_cupid_df.drop(index = [2512, 25324])
ok_cupid_df[ok_cupid_df['age']>100]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len |
---|
0 rows × 23 columns
Check duplicated columns:
ok_cupid_df.columns.duplicated()
array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False])
ok_cupid_df[ok_cupid_df.duplicated()]
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | last_online | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len |
---|
0 rows × 23 columns
ok_cupid_df['income'].value_counts()
-1 48437
20000 2952
100000 1621
80000 1111
30000 1048
40000 1005
50000 975
60000 736
70000 707
150000 631
1000000 521
250000 149
500000 48
Name: income, dtype: int64
ok_cupid_df.shape
(59941, 23)
#for column in ok_cupid_df.columns:
# plt.figure()
# plt.hist(ok_cupid_df[column], bins=25)
# plt.title(f'Histogram of {column}')
# plt.show()
Add binary column for gender (and drop existing sex column):
# Keep only profiles that reported an income (-1 means "not stated").
# Take an explicit copy: the education re-labelling further down assigns
# into valid_income_df, which on a plain boolean slice triggers pandas'
# SettingWithCopyWarning / silent chained assignment.
valid_income_df = ok_cupid_df[ok_cupid_df['income'] != -1].copy()
male_df_income = valid_income_df[valid_income_df['sex'] == 'm']
female_df_income = valid_income_df[valid_income_df['sex'] == 'f']
ok_cupid_df2 = ok_cupid_df.copy()
# Collapse near-synonymous body types into broader buckets.
# A single mapping replaces the four sequential replace() calls; no key
# maps onto another key, so the result is identical.
ok_cupid_df2['body_type'] = ok_cupid_df2['body_type'].replace({
    'a little extra': 'curvy',
    'full figured': 'curvy',
    'thin': 'skinny',
    'fit': 'athletic',
    'unknown_body_type': 'rather not say',
})
# Add new binary column for gender (1 = male, 0 = female).
ok_cupid_df["male"] = np.where(ok_cupid_df["sex"]=="m", 1, 0)
ok_cupid_df.head(2)
age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | ... | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 |
1 | 35 | single | m | straight | average | mostly other | often | sometimes | working on space camp | white | ... | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 |
2 rows × 24 columns
# Drop the sex column
ok_cupid_df.drop(columns="sex", inplace=True)
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | location | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 |
2 rows × 23 columns
ok_cupid_df.reset_index(drop=True, inplace=True)
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay', 'essay_len'],
dtype='object')
# Strip the "working on / graduated from / dropped out of" qualifiers so
# education can be compared on the base credential alone.  Building the
# 24-entry mapping programmatically covers exactly the same keys as the
# original hand-written dict.
_edu_prefixes = ('working on ', 'graduated from ', 'dropped out of ')
_edu_levels = ('space camp', 'college/university', 'high school',
               'law school', 'masters program', 'two-year college',
               'med school', 'ph.d program')
_edu_map = {prefix + level: level
            for level in _edu_levels
            for prefix in _edu_prefixes}
valid_income_df['education'] = valid_income_df['education'].replace(_edu_map)
male_df_income = valid_income_df[valid_income_df['sex'] == 'm']
female_df_income = valid_income_df[valid_income_df['sex'] == 'f']
# Per-education mean/median income for men.
# The labels zipped into the result frame must come from the SAME
# unique() sequence the loop iterates (valid_income_df); the original
# zipped male_df_income's unique() order, which can differ, pairing the
# statistics with the wrong education label.
educations_male = valid_income_df['education'].unique()
means_male = []
medians_male = []
for education in educations_male:
    male_incomes = male_df_income.loc[male_df_income['education'] == education, 'income']
    means_male.append(male_incomes.mean())
    medians_male.append(male_incomes.median())
list_of_tuples_male = list(zip(educations_male, means_male, medians_male))
df_mean_incomes_male = pd.DataFrame(list_of_tuples_male, columns=['education', 'mean_income', 'median_income'])
# Per-education mean/median income for women.
# Same alignment fix as for the male frame: labels must come from the
# unique() sequence the loop iterates, not female_df_income's own
# (potentially differently ordered) unique().
educations_female = valid_income_df['education'].unique()
means_female = []
medians_female = []
for education in educations_female:
    female_incomes = female_df_income.loc[female_df_income['education'] == education, 'income']
    means_female.append(female_incomes.mean())
    medians_female.append(female_incomes.median())
list_of_tuples_female = list(zip(educations_female, means_female, medians_female))
df_mean_incomes_female = pd.DataFrame(list_of_tuples_female, columns=['education', 'mean_income', 'median_income'])
df_mean_incomes_male_mean = df_mean_incomes_male.sort_values(by = 'mean_income')
df_mean_incomes_male_median = df_mean_incomes_male.sort_values(by = 'median_income')
df_mean_incomes_female_mean = df_mean_incomes_female.sort_values(by = 'mean_income')
df_mean_incomes_female_median = df_mean_incomes_female.sort_values(by = 'median_income')
#df_mean_incomes_male_mean
educations = valid_income_df['education'].unique()
# Horizontal grouped bars: median income per education level, by sex.
# Each trace pairs its OWN education labels with its own median values;
# the original used the unsorted `educations` array as y for x-values
# sorted by median, mislabeling every bar.
fig = go.Figure(data=[
    go.Bar(name='Male',
           y=df_mean_incomes_male_median['education'],
           x=df_mean_incomes_male_median['median_income'],
           text=df_mean_incomes_male_median['median_income'], orientation='h'),
    go.Bar(name='Female',
           y=df_mean_incomes_female_median['education'],
           x=df_mean_incomes_female_median['median_income'],
           text=df_mean_incomes_female_median['median_income'], orientation='h')
])
fig.update_layout(barmode='group',
                  title='Median Income for each education',
                  yaxis=dict(title='Educations',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Median Income',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
fig = go.Figure()
fig.add_trace(go.Histogram(x=male_df_income['income'], name='Male'))
fig.add_trace(go.Histogram(x=female_df_income['income'], name='Female'))
fig.update_layout(barmode='overlay',
title='Distribution of Income for Males and Females',
#xaxis_tickfont_size=14,
yaxis=dict(title='Count',
titlefont_size=16,
tickfont_size=14),
xaxis=dict(title='Income',
titlefont_size=16,
tickfont_size=14))
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()
labels = ['Male', 'Female']
values = ok_cupid_df2['sex'].value_counts().values
fig = go.Figure(data=[go.Pie(labels=labels,
values=values,
hole=.5)])
fig.update_layout(title='Male vs Female')
fig.show()
Create new dataframes for Male, Female, and Orientations:
male_df = ok_cupid_df2[ok_cupid_df2['sex'] == 'm']
female_df = ok_cupid_df2[ok_cupid_df2['sex'] == 'f']
# Collapse df2's religion strings into four buckets:
# agnostic / atheist / unknown_religion / religious.
# Map every variant straight to its FINAL label: the original replaced
# variants to the intermediate values 'agnosticism'/'atheism' in the same
# dict that mapped those intermediates onward — replace() does not chain,
# so the variants survived and the catch-all loop mislabelled them as
# 'religious'.  (Column was already filled, so every value is a string.)
def _bucket_religion_df2(value):
    if value.startswith('agnosticism'):
        return 'agnostic'
    if value.startswith('atheism'):
        return 'atheist'
    if value == 'unknown_religion':
        return 'unknown_religion'
    return 'religious'

ok_cupid_df2['religion'] = ok_cupid_df2['religion'].apply(_bucket_religion_df2)
# Derive religion_binary: agnostic / atheist / unknown_religion / religious.
# Map each variant directly to its final label.  The original two-step
# replace left the 'laughing about it' variants on the intermediate
# values 'agnosticism'/'atheism' (replace() does not chain within one
# call), so the catch-all loop mislabelled them as 'religious'.
def _bucket_religion(value):
    if value.startswith('agnosticism'):
        return 'agnostic'
    if value.startswith('atheism'):
        return 'atheist'
    if value == 'unknown_religion':
        return 'unknown_religion'
    return 'religious'

ok_cupid_df['religion_binary'] = ok_cupid_df['religion'].apply(_bucket_religion)
ok_cupid_df2['religion'].value_counts()
religious 34820
unknown_religion 20222
agnostic 2724
atheist 2175
Name: religion, dtype: int64
ok_cupid_df['religion_binary'].value_counts()
religious 28492
unknown_religion 20222
agnostic 6316
atheist 4911
Name: religion_binary, dtype: int64
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | offspring | pets | religion | sign | smokes | speaks | essay | essay_len | male | religion_binary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 | agnostic |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | agnostic |
2 rows × 24 columns
# One 0/1 indicator column per collapsed religion bucket.
for bucket in ('religious', 'agnostic', 'atheist'):
    ok_cupid_df[f'is_{bucket}'] = np.where(
        ok_cupid_df['religion_binary'] == bucket, 1, 0)
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | sign | smokes | speaks | essay | essay_len | male | religion_binary | is_religious | is_agnostic | is_atheist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | gemini | sometimes | english | about me i would love to think that i was som... | 2389 | 1 | agnostic | 0 | 1 | 0 |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | cancer | no | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | agnostic | 0 | 1 | 0 |
2 rows × 27 columns
straight_df = ok_cupid_df[ok_cupid_df['orientation'] == 'straight']
non_straight_df = ok_cupid_df[ok_cupid_df['orientation'].isin(['bisexual', 'gay'])]
non_straight_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | sign | smokes | speaks | essay | essay_len | male | religion_binary | is_religious | is_agnostic | is_atheist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
37 | 25 | single | bisexual | fit | mostly anything | socially | unknown_drugs | working on college/university | hispanic / latin, white | 69.0 | ... | libra and it’s fun to think about | unknown_smokes | english (fluently), spanish (poorly) | lets go to a festival and dance all night runn... | 3516 | 1 | unknown_religion | 0 | 0 | 0 |
44 | 29 | single | bisexual | curvy | anything | socially | sometimes | graduated from masters program | white | 66.0 | ... | aquarius and it’s fun to think about | no | english, spanish (poorly), portuguese (poorly) | i am an east coast transplant looking for fun ... | 2259 | 0 | religious | 1 | 0 | 0 |
2 rows × 27 columns
# Complementary 0/1 indicators for sexual orientation.
straight_mask = ok_cupid_df['orientation'] == 'straight'
ok_cupid_df['is_straight'] = straight_mask.astype(int)
ok_cupid_df['is_not_straight'] = (~straight_mask).astype(int)
ok_cupid_df.head(2)
age | status | orientation | body_type | diet | drinks | drugs | education | ethnicity | height | ... | speaks | essay | essay_len | male | religion_binary | is_religious | is_agnostic | is_atheist | is_straight | is_not_straight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | straight | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | ... | english | about me i would love to think that i was som... | 2389 | 1 | agnostic | 0 | 1 | 0 | 1 | 0 |
1 | 35 | single | straight | average | mostly other | often | sometimes | working on space camp | white | 70.0 | ... | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | agnostic | 0 | 1 | 0 | 1 | 0 |
2 rows × 29 columns
# relgns =ok_cupid_df['religion_binary'].unique()
# fig = go.Figure([go.Bar(x=relgns, y=ok_cupid_df["religion_binary"].value_counts().values, color=ok_cupid_df['is_straight'])])
# fig.show()
# Grouped bars: collapsed religion buckets for straight vs non-straight.
# Count the collapsed religion_binary column and reindex to the label
# order: the original paired these four labels with value_counts() of
# the RAW religion column (dozens of categories, sorted descending),
# so the bars were mislabeled and truncated.
religious_views = ['religious', 'unknown_religion', 'agnostic', 'atheist']
straight_counts = straight_df['religion_binary'].value_counts().reindex(
    religious_views, fill_value=0)
non_straight_counts = non_straight_df['religion_binary'].value_counts().reindex(
    religious_views, fill_value=0)
fig = go.Figure(data=[
    go.Bar(name='Straight',
           x=religious_views,
           y=straight_counts.values,
           text=straight_counts.values),
    go.Bar(name='Non-straight',
           x=religious_views,
           y=non_straight_counts.values,
           text=non_straight_counts.values)
])
fig.update_layout(barmode='group',
                  title='Religions',
                  yaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Religions',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
# The religion comparison above is done; drop the helper columns so they do
# not leak into later encodings.
ok_cupid_df = ok_cupid_df.drop(columns=['is_not_straight', 'religion_binary'])
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay', 'essay_len'],
dtype='object')
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay', 'essay_len'],
dtype='object')
# Overlaid age histograms, one trace per sex.
fig = go.Figure()
for frame, label in ((male_df, 'Male'), (female_df, 'Female')):
    fig.add_trace(go.Histogram(x=frame['age'], name=label))
fig.update_layout(barmode='overlay',
                  title='Distribution of Age for Males and Females',
                  yaxis=dict(title='Count',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Age',
                             titlefont_size=16,
                             tickfont_size=14))
# Translucent bars keep both distributions visible where they overlap.
fig.update_traces(opacity=0.75)
fig.show()
# Overlaid height histograms, one trace per sex.
fig = go.Figure()
for frame, label in ((male_df, 'Male'), (female_df, 'Female')):
    fig.add_trace(go.Histogram(x=frame['height'], name=label))
fig.update_layout(barmode='overlay',
                  title='Distribution of Heights for Males and Females',
                  yaxis=dict(title='Count',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Height',
                             titlefont_size=16,
                             tickfont_size=14))
# Translucent bars keep both distributions visible where they overlap.
fig.update_traces(opacity=0.75)
fig.show()
orientations = ['straight', 'bisexual', 'gay']
# value_counts() sorts by frequency, which may differ between the sexes and
# from the hard-coded label order; reindex so bars match their labels.
male_counts = male_df['orientation'].value_counts().reindex(orientations).fillna(0).values
female_counts = female_df['orientation'].value_counts().reindex(orientations).fillna(0).values
fig = go.Figure(data=[
    go.Bar(name='Male',
           x=orientations,
           y=male_counts,
           text=male_counts),
    go.Bar(name='Female',
           x=orientations,
           y=female_counts,
           text=female_counts)
])
fig.update_layout(barmode='group',
                  title='Orientations',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Orientation',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
# Body-type counts per sex.  The original hard-coded a label list that
# repeated 'average' and omitted several categories (e.g. 'athletic'), so
# bars were mislabeled; derive the categories from the data and align the
# counts explicitly instead.
male_counts = male_df['body_type'].value_counts()
female_counts = female_df['body_type'].value_counts()
body_types = male_counts.index.union(female_counts.index).tolist()
male_aligned = male_counts.reindex(body_types).fillna(0).values
female_aligned = female_counts.reindex(body_types).fillna(0).values
fig = go.Figure(data=[
    go.Bar(name='Male',
           y=body_types,
           x=male_aligned,
           text=male_aligned, orientation='h'),
    go.Bar(name='Female',
           y=body_types,
           x=female_aligned,
           text=female_aligned, orientation='h')
])
fig.update_layout(barmode='group',
                  title='Body Types for Male and Female',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Body Types',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
statuses = ['single', 'available', 'seeing someone', 'married', 'unknown_status']
# value_counts() orders by frequency, which need not match `statuses`;
# reindex so each bar matches its label (missing categories count as 0).
male_counts = male_df['status'].value_counts().reindex(statuses).fillna(0).values
female_counts = female_df['status'].value_counts().reindex(statuses).fillna(0).values
fig = go.Figure(data=[
    go.Bar(name='Male',
           x=statuses,
           y=male_counts,
           text=male_counts),
    go.Bar(name='Female',
           x=statuses,
           y=female_counts,
           text=female_counts)
])
fig.update_layout(barmode='group',
                  title='Status',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Counts',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Status',
                             titlefont_size=16,
                             tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
Encoding:
Orientation, status, job
from sklearn.preprocessing import OneHotEncoder

# One encoder per feature so each keeps its own fitted category list.
orientation_ohe = OneHotEncoder()
status_ohe = OneHotEncoder()
job_ohe = OneHotEncoder()

# Fit and transform each single-column frame into a sparse indicator matrix.
orientation = ok_cupid_df[['orientation']]
orientation_encoded = orientation_ohe.fit_transform(orientation)
display(orientation_encoded)

status = ok_cupid_df[['status']]
status_encoded = status_ohe.fit_transform(status)
display(status_encoded)

job = ok_cupid_df[['job']]
job_encoded = job_ohe.fit_transform(job)
display(job_encoded)
<59941x3 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x5 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x22 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Densify each sparse matrix into a labelled 0/1 frame, then drop one
# category per feature to avoid perfectly collinear (redundant) columns.
encoded_df_orientation = pd.DataFrame(orientation_encoded.toarray().astype(int),
                                      columns=orientation_ohe.categories_[0], dtype=int)
encoded_df_orientation = encoded_df_orientation.iloc[:, 1:]  # drop the first category
display(encoded_df_orientation.head(2))

# Status: the explicit unknown bucket serves as the dropped baseline.
encoded_df_status = pd.DataFrame(status_encoded.toarray().astype(int),
                                 columns=status_ohe.categories_[0], dtype=int)
encoded_df_status = encoded_df_status.drop(columns='unknown_status')
display(encoded_df_status.head(2))

# Jobs: likewise drop the unknown bucket.
encoded_df_job = pd.DataFrame(job_encoded.toarray().astype(int),
                              columns=job_ohe.categories_[0], dtype=int)
encoded_df_job = encoded_df_job.drop(columns='unknown_job')
display(encoded_df_job.head(2))
gay | straight | |
---|---|---|
0 | 0 | 1 |
1 | 0 | 1 |
available | married | seeing someone | single | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 1 |
artistic / musical / writer | banking / financial / real estate | clerical / administrative | computer / hardware / software | construction / craftsmanship | education / academia | entertainment / media | executive / management | hospitality / travel | law / legal services | ... | military | other | political / government | rather not say | retired | sales / marketing / biz dev | science / tech / engineering | student | transportation | unemployed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 21 columns
# Rename the job 'other' category so it cannot collide with other 'other'
# columns added later.
encoded_df_job = encoded_df_job.rename(columns={'other': 'otherjob'})
# Attach the orientation indicators and retire the raw column.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_orientation], axis=1).drop(columns='orientation')
ok_cupid_df.head(2)
age | status | body_type | diet | drinks | drugs | education | ethnicity | height | income | ... | speaks | essay | essay_len | male | is_religious | is_agnostic | is_atheist | is_straight | gay | straight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | ... | english | about me i would love to think that i was som... | 2389 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
1 | 35 | single | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | ... | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
2 rows × 28 columns
# Attach the status indicators and retire the raw column.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_status], axis=1).drop(columns='status')
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | job | ... | is_religious | is_agnostic | is_atheist | is_straight | gay | straight | available | married | seeing someone | single | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | transportation | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | hospitality / travel | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
2 rows × 31 columns
# Attach the job indicators and retire the raw column.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_job], axis=1).drop(columns='job')
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | last_online | ... | military | otherjob | political / government | rather not say | retired | sales / marketing / biz dev | science / tech / engineering | student | transportation | unemployed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | 2012-06-28-20-30 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | 2012-06-29-21-41 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 51 columns
Essays:
from sklearn.feature_extraction.text import CountVectorizer
# import the nltk stopwords
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Let's test it out
stemmer = nltk.stem.PorterStemmer()
# A set makes stopword membership O(1); the original list scanned O(n) per word.
ENGLISH_STOP_WORDS = set(stopwords.words('english'))
def my_tokenizer(sentence):
    """Tokenizer for CountVectorizer: split on single spaces, drop stopwords
    and empty tokens, and Porter-stem the surviving words.

    Assumes punctuation/case were already normalized upstream (the
    commented-out cleanup in the original hints at this — confirm).
    Returns a list of stemmed tokens.
    """
    listofstemmed_words = []
    for word in sentence.split(' '):
        # skip stopwords and empty strings produced by repeated spaces
        if word and word not in ENGLISH_STOP_WORDS:
            listofstemmed_words.append(stemmer.stem(word))
    return listofstemmed_words
Requirement already satisfied: nltk in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (3.6.2)
Requirement already satisfied: regex in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (2021.4.4)
Requirement already satisfied: joblib in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (1.0.1)
Requirement already satisfied: click in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (7.1.2)
Requirement already satisfied: tqdm in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (4.60.0)
[nltk_data] Downloading package stopwords to
[nltk_data] /Users/puneetsran/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
# 1. Instantiate: keep the 100 most frequent stems that appear in at least
#    5 essays, tokenized/stemmed by my_tokenizer.
essay = CountVectorizer(min_df=5, max_features=100, tokenizer=my_tokenizer)
# 2.+3. Fit the vocabulary and transform the essays in one pass.
essay_transformed = essay.fit_transform(ok_cupid_df["essay"])
essay_transformed
<59941x100 sparse matrix of type '<class 'numpy.int64'>'
with 2153638 stored elements in Compressed Sparse Row format>
# Label the dense count matrix with the learned vocabulary (100 stems).
essay_df = pd.DataFrame(essay_transformed.toarray(), columns=essay.get_feature_names())
essay_df.head(2)
adventur | also | alway | anyth | around | art | back | big | book | citi | ... | want | watch | way | well | work | world | would | write | year | your | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 3 | 0 | ... | 4 | 0 | 3 | 0 | 3 | 0 | 2 | 0 | 1 | 0 |
1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 100 columns
# Join back to ok_cupid_df
# NOTE(review): pd.concat aligns on index, and essay_df has a fresh
# RangeIndex — verify ok_cupid_df's index was reset after any earlier row
# drops (the frame shows 59941 of the original 59946 rows) or this join
# could misalign rows.
ok_cupid_df = pd.concat([ok_cupid_df, essay_df], axis=1)
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | last_online | ... | want | watch | way | well | work | world | would | write | year | your | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | 2012-06-28-20-30 | ... | 4 | 0 | 3 | 0 | 3 | 0 | 2 | 0 | 1 | 0 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | 2012-06-29-21-41 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 151 columns
# Vectorize the ethnicity strings; rows listing several ethnicities
# (e.g. "asian, white") get a count in each matching token column.
ethnicity = CountVectorizer()
ethnicity_transformed = ethnicity.fit_transform(ok_cupid_df["ethnicity"])
ethnicity_transformed
<59941x14 sparse matrix of type '<class 'numpy.int64'>'
with 78195 stored elements in Compressed Sparse Row format>
ethnicity_transformed.toarray()
array([[0, 1, 0, ..., 0, 0, 1],
[0, 0, 0, ..., 0, 0, 1],
[0, 0, 0, ..., 0, 1, 0],
...,
[0, 1, 0, ..., 0, 0, 0],
[0, 1, 1, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 1]])
# Label the dense matrix with the learned vocabulary tokens.
ethnicity_df = pd.DataFrame(columns=ethnicity.get_feature_names(), data=ethnicity_transformed.toarray())
# Total number of profiles with unknown ethnicity (kept for the pie chart below).
unknown_ethnicity = ethnicity_df[ethnicity_df['unknown_ethnicity'] == 1]['unknown_ethnicity']
unknown_ethnicity = unknown_ethnicity.sum()
# Drop one column to prevent redundant information
ethnicity_df = ethnicity_df.drop(columns='unknown_ethnicity')
# CountVectorizer split two-word ethnicities ("native american", "middle
# eastern", "pacific islander", "hispanic / latin") into separate tokens:
# rename one half to the full name, then drop the leftover halves.
# The rename must happen before the drop.
ethnicity_df = ethnicity_df.rename(columns={'american': 'native_american', 'eastern': 'middle_eastern', 'islander': 'pacific_islander', 'hispanic': 'hispanic_latin'})
ethnicity_df = ethnicity_df.drop(columns=['native', 'middle', 'pacific', 'latin'])
ethnicity_df.head(2)
native_american | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other | white | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
# Avoid clashing with the essay-token column also named 'other'.
ethnicity_df = ethnicity_df.rename(columns={'other': 'other_ethnicity'})
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, ethnicity_df], axis=1)
ok_cupid_df.head(2)
age | body_type | diet | drinks | drugs | education | ethnicity | height | income | last_online | ... | your | native_american | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other_ethnicity | white | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | a little extra | strictly anything | socially | never | working on college/university | asian, white | 75.0 | -1 | 2012-06-28-20-30 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 35 | average | mostly other | often | sometimes | working on space camp | white | 70.0 | 80000 | 2012-06-29-21-41 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 160 columns
ethnicity_df['native_american'].value_counts()[1]
1265
# Per-ethnicity profile totals.  Each column is a 0/1 indicator, so summing
# the values where the flag equals 1 (as the original did, column by column)
# is just the number of rows carrying that flag.
def _flag_total(column):
    # Same filter-then-sum as the original, factored out of the repetition.
    return ethnicity_df[ethnicity_df[column] == 1][column].sum()

native_american = _flag_total('native_american')
asian = _flag_total('asian')
black = _flag_total('black')
middle_eastern = _flag_total('middle_eastern')
hispanic_latin = _flag_total('hispanic_latin')
indian = _flag_total('indian')
pacific_islander = _flag_total('pacific_islander')
other_ethnicity = _flag_total('other_ethnicity')
white = _flag_total('white')
# Initialise a summary table of ethnicity totals (includes the unknown
# bucket computed before that column was dropped).
sum_ethnicities = {
    'ethnicity': ['native american', 'asian', 'black',
                  'middle eastern', 'hispanic/latin',
                  'indian', 'pacific islander',
                  'other_ethnicity', 'white', 'unknown_ethnicity'],
    'sum': [native_american, asian, black, middle_eastern,
            hispanic_latin, indian, pacific_islander, other_ethnicity,
            white, unknown_ethnicity],
}
ethnicities_sum = pd.DataFrame(sum_ethnicities)
ethnicities_sum['sum'].unique()
array([ 1265, 8205, 3328, 950, 5356, 1449, 1473, 3566, 37882,
5677])
#fig = px.bar(ethnicities_sum, x='ethnicity', y='sum')
#fig.show()
# Use the columns directly: the original took .unique() on 'sum', which
# would silently drop a slice (and misalign labels with values) if two
# ethnicities ever had equal totals.
labels = ethnicities_sum['ethnicity'].values
values = ethnicities_sum['sum'].values
fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values,
                             hole=.5)])
fig.update_layout(title='Ethnicities')  # fixed title typo ('Ethinicities')
fig.show()
# The raw string column is now fully encoded; drop it.
ok_cupid_df.drop(columns='ethnicity', inplace=True)
Body type
ok_cupid_df['body_type'].value_counts()
average 14652
fit 12711
athletic 11818
unknown_body_type 5292
thin 4711
curvy 3924
a little extra 2629
skinny 1777
full figured 1009
overweight 444
jacked 421
used up 355
rather not say 198
Name: body_type, dtype: int64
# Collapse overlapping body-type labels into broader buckets in one pass.
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace({
    'a little extra': 'curvy',
    'full figured': 'curvy',
    'thin': 'skinny',
    'fit': 'athletic',
    'unknown_body_type': 'rather not say',
})
ok_cupid_df['body_type'].value_counts()
athletic 24529
average 14652
curvy 7562
skinny 6488
rather not say 5490
overweight 444
jacked 421
used up 355
Name: body_type, dtype: int64
# Ordinal encoding of body_type.
body_type_mapping = {'overweight': 0, 'curvy': 1, 'average': 2, 'used up': 3,
                     'rather not say': 4, 'skinny': 5, 'athletic': 6, 'jacked': 7}
body_type_mapped_data = ok_cupid_df['body_type'].map(body_type_mapping)
# Replace the string column with its encoded counterpart.
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns='body_type'), body_type_mapped_data], axis=1)
ok_cupid_df.head(2)
age | diet | drinks | drugs | education | height | income | last_online | location | offspring | ... | native_american | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other_ethnicity | white | body_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | strictly anything | socially | never | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 35 | mostly other | often | sometimes | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 |
2 rows × 159 columns
Drinks
ok_cupid_df['drinks'].value_counts()
socially 41780
rarely 5957
often 5164
not at all 3267
unknown_drinks 2980
very often 471
desperately 322
Name: drinks, dtype: int64
# Ordinal encoding of drinking frequency (0 = not at all ... 6 = desperately).
drinks_mapping = {'desperately': 6, 'very often': 5, 'often': 4,
                  'unknown_drinks': 3, 'socially': 2, 'rarely': 1, 'not at all': 0}
drinks_mapped_data = ok_cupid_df['drinks'].map(drinks_mapping)
# Replace the string column with its encoded counterpart.
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns=['drinks']), drinks_mapped_data], axis=1)
ok_cupid_df.head(2)
age | diet | drugs | education | height | income | last_online | location | offspring | pets | ... | asian | black | middle_eastern | hispanic_latin | indian | pacific_islander | other_ethnicity | white | body_type | drinks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | strictly anything | never | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
1 | 35 | mostly other | sometimes | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 |
2 rows × 159 columns
Diet
ok_cupid_df['diet'].value_counts()
unknowndiet 24389
mostly anything 16585
anything 6183
strictly anything 5114
mostly vegetarian 3444
mostly other 1006
strictly vegetarian 875
vegetarian 667
strictly other 452
mostly vegan 338
other 331
strictly vegan 228
vegan 137
mostly kosher 86
mostly halal 48
strictly halal 18
strictly kosher 18
halal 11
kosher 11
Name: diet, dtype: int64
# Earlier approach (merge bare labels into the 'strictly' tier) kept for reference:
#ok_cupid_df["diet"] = ok_cupid_df["diet"].replace({'anything': 'strictly anything', 'vegetarian': 'strictly vegetarian', 'other': 'strictly other',
#                                                    'vegan': 'strictly vegan', 'kosher': 'strictly kosher', 'halal': 'strictly halal'})
# Remove internal spaces so each diet value becomes a single CountVectorizer
# token (e.g. "mostly anything" -> "mostlyanything").
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()
unknowndiet 24389
mostlyanything 16585
anything 6183
strictlyanything 5114
mostlyvegetarian 3444
mostlyother 1006
strictlyvegetarian 875
vegetarian 667
strictlyother 452
mostlyvegan 338
other 331
strictlyvegan 228
vegan 137
mostlykosher 86
mostlyhalal 48
strictlykosher 18
strictlyhalal 18
kosher 11
halal 11
Name: diet, dtype: int64
# Substring replace turns 'other' -> 'otherdiet' (and likewise
# 'mostlyother'/'strictlyother') so the category cannot clash with the other
# 'other*' columns created elsewhere.
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace('other', 'otherdiet')
#ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()
unknowndiet 24389
mostlyanything 16585
anything 6183
strictlyanything 5114
mostlyvegetarian 3444
mostlyotherdiet 1006
strictlyvegetarian 875
vegetarian 667
strictlyotherdiet 452
mostlyvegan 338
otherdiet 331
strictlyvegan 228
vegan 137
mostlykosher 86
mostlyhalal 48
strictlykosher 18
strictlyhalal 18
kosher 11
halal 11
Name: diet, dtype: int64
# Vectorize the collapsed diet labels; each row holds exactly one token, so
# the result is effectively a one-hot matrix (59941 stored elements for
# 59941 rows).
diet = CountVectorizer()
diet_transformed = diet.fit_transform(ok_cupid_df["diet"])
diet_transformed
<59941x19 sparse matrix of type '<class 'numpy.int64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Label the dense matrix with the learned diet vocabulary.
diet_df = pd.DataFrame(diet_transformed.toarray(), columns=diet.get_feature_names())
# Drop the unknown bucket so it serves as the implicit baseline.
diet_df = diet_df.drop(columns=['unknowndiet'])
diet_df.head(2)
anything | halal | kosher | mostlyanything | mostlyhalal | mostlykosher | mostlyotherdiet | mostlyvegan | mostlyvegetarian | otherdiet | strictlyanything | strictlyhalal | strictlykosher | strictlyotherdiet | strictlyvegan | strictlyvegetarian | vegan | vegetarian | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Reduce the vocabulary to the base diets by filtering out the 'mostly*' and
# 'strictly*' variants.  The original removed items from the list while
# iterating over it, which skips the element following each removal and can
# leave stray variants behind; a comprehension has no such hazard.
diff_diets = [d for d in diet_df.columns if not d.startswith(('mostly', 'strictly'))]
diets = diff_diets
# Variant names derived from each base diet, used for the ordinal fold below.
diets_mostly = ['mostly' + diet for diet in diets]
diets_strictly = ['strictly' + diet for diet in diets]
# Fold the three intensity levels into one ordinal column per base diet:
# 0 = not mentioned, 1 = "mostly X", 2 = plain "X", 3 = "strictly X".
# Plain mentions first: mark them 2 in the base column.
for diet in diets:
    if diet in diet_df.columns:
        diet_df.loc[diet_df[diet] == 1, diet] = 2
# "mostly X" rows get a 1 in the base column; the variant column is dropped.
for diet, diet_mostly in zip(diets, diets_mostly):
    if diet_mostly in diet_df.columns:
        diet_df.loc[diet_df[diet_mostly] == 1, diet] = 1
        diet_df = diet_df.drop(columns=diet_mostly)
# "strictly X" rows get a 3; that variant column is dropped too.
for diet, diet_strictly in zip(diets, diets_strictly):
    if diet_strictly in diet_df.columns:
        diet_df.loc[diet_df[diet_strictly] == 1, diet] = 3
        diet_df = diet_df.drop(columns=diet_strictly)
diet_df.head(2)
anything | halal | kosher | otherdiet | vegan | vegetarian | |
---|---|---|---|---|---|---|
0 | 3 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 |
# NOTE(review): diet_df has no 'diet' column at this point (its columns are
# the diet categories), so this rename is a no-op — the 'other' category was
# already renamed via the 'otherdiet' string replacement upstream; confirm
# and consider removing.
diet_df.rename(columns = {'diet':'otherdiet'}, inplace = True)
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, diet_df], axis=1)
ok_cupid_df.drop(columns='diet', inplace=True)
ok_cupid_df.head(2)
age | drugs | education | height | income | last_online | location | offspring | pets | religion | ... | other_ethnicity | white | body_type | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | never | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | ... | 0 | 1 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | sometimes | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | ... | 0 | 1 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 164 columns
Drugs
ok_cupid_df['drugs'].value_counts()
never 37722
unknown_drugs 14077
sometimes 7732
often 410
Name: drugs, dtype: int64
# Ordinal encoding of drug use (0 = never ... 3 = often).
drugs_mapping = {'often': 3, 'sometimes': 2, 'unknown_drugs': 1, 'never': 0}
# Mapping covers every observed label, so the int cast is safe (no NaNs).
drugs_mapped_data = ok_cupid_df['drugs'].map(drugs_mapping).astype(int)
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns=['drugs']), drugs_mapped_data], axis=1)
ok_cupid_df.head(2)
age | education | height | income | last_online | location | offspring | pets | religion | sign | ... | white | body_type | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | drugs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | working on college/university | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | ... | 1 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | working on space camp | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | ... | 1 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
2 rows × 164 columns
Education:
ok_cupid_df['education'].value_counts()
graduated from college/university 23959
graduated from masters program 8961
unknown_education 6624
working on college/university 5712
working on masters program 1682
graduated from two-year college 1531
graduated from high school 1428
graduated from ph.d program 1272
graduated from law school 1122
working on two-year college 1074
dropped out of college/university 995
working on ph.d program 983
college/university 801
graduated from space camp 657
dropped out of space camp 523
graduated from med school 446
working on space camp 445
working on law school 269
two-year college 222
working on med school 212
dropped out of two-year college 191
dropped out of masters program 140
masters program 136
dropped out of ph.d program 127
dropped out of high school 102
high school 96
working on high school 87
space camp 58
ph.d program 26
law school 19
dropped out of law school 18
dropped out of med school 12
med school 11
Name: education, dtype: int64
# Ordinal encoding of education.  Tiers, roughly: space camp (0-2) <
# high school (3-5) < two-year college (6-8) < college/university (9-11) <
# unknown (12) < masters (13-15) < professional/doctoral programs (16-18).
# NOTE(review): law school, ph.d and med school deliberately share codes
# 16-18, and a bare level name (e.g. 'masters program') is coded the same as
# having graduated from it — presumably intentional; confirm.
education_mapping = {'dropped out of space camp':0, 'working on space camp':1, 'graduated from space camp':2,
                     'space camp': 2,
                     'dropped out of high school':3, 'working on high school': 4, 'graduated from high school': 5,
                     'high school': 5,
                     'dropped out of two-year college': 6, 'working on two-year college': 7,
                     'two-year college': 8,
                     'graduated from two-year college': 8, 'dropped out of college/university': 9,
                     'college/university': 11,
                     'working on college/university': 10, 'graduated from college/university': 11, 'unknown_education': 12,
                     'dropped out of masters program': 13, 'working on masters program': 14,
                     'masters program': 15,
                     'graduated from masters program': 15, 'dropped out of law school': 16,
                     'working on law school': 17, 'graduated from law school': 18,
                     'law school': 18,
                     'dropped out of ph.d program': 16, 'working on ph.d program': 17,
                     'ph.d program': 18,
                     'graduated from ph.d program': 18, 'dropped out of med school': 16,
                     'working on med school': 17, 'graduated from med school': 18, 'med school': 18}
education_mapped_data = ok_cupid_df['education'].map(education_mapping)
# The mapping covers every observed label, so no NaNs and the cast is safe.
education_mapped_data = education_mapped_data.astype(int)
ok_cupid_df = ok_cupid_df.drop(columns=['education'])
ok_cupid_df = pd.concat([ok_cupid_df, education_mapped_data], axis=1)
ok_cupid_df.head(2)
age | height | income | last_online | location | offspring | pets | religion | sign | smokes | ... | body_type | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | drugs | education | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | sometimes | ... | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 10 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | no | ... | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
2 rows × 164 columns
Smokes:
ok_cupid_df['smokes'].value_counts()
no 43895
unknown_smokes 5509
sometimes 3787
when drinking 3039
yes 2231
trying to quit 1480
Name: smokes, dtype: int64
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder # This is used for multiple columns
# Instantiate the label encoder
le = LabelEncoder()
# Fit and transform the smokes column.  Exploratory only: LabelEncoder
# assigns codes alphabetically, which is not a meaningful ordering here, so
# a manual mapping is applied in the next cell instead.
le.fit_transform(ok_cupid_df['smokes'])
array([1, 0, 0, ..., 0, 2, 1])
le.classes_
array(['no', 'sometimes', 'trying to quit', 'unknown_smokes',
'when drinking', 'yes'], dtype=object)
# Manual ordinal encoding of smoking (0 = no ... 4 = yes); 'sometimes' and
# 'when drinking' intentionally share a level.
smokes_mapping = {'yes': 4, 'sometimes': 3, 'when drinking': 3,
                  'unknown_smokes': 2, 'trying to quit': 1, 'no': 0}
smokes_mapped_data = ok_cupid_df['smokes'].map(smokes_mapping).astype(int)
ok_cupid_df = pd.concat([ok_cupid_df.drop(columns=['smokes']), smokes_mapped_data], axis=1)
ok_cupid_df.head(2)
age | height | income | last_online | location | offspring | pets | religion | sign | speaks | ... | drinks | anything | halal | kosher | otherdiet | vegan | vegetarian | drugs | education | smokes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | south san francisco, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | english | ... | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 3 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | oakland, california | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 |
2 rows × 164 columns
Location:
ok_cupid_df['location'].str.split(',', expand=True).head(2)
0 | 1 | 2 | |
---|---|---|---|
0 | south san francisco | california | None |
1 | oakland | california | None |
ok_cupid_df['location'].str.split(',', expand=True)[1].unique()
array([' california', ' colorado', ' new york', ' oregon', ' arizona',
' hawaii', ' montana', ' wisconsin', ' virginia', ' spain',
' nevada', ' illinois', ' vietnam', ' ireland', ' louisiana',
' michigan', ' texas', ' united kingdom', ' massachusetts',
' north carolina', ' idaho', ' mississippi', ' new jersey',
' florida', ' minnesota', ' georgia', ' utah', ' washington',
' west virginia', ' connecticut', ' tennessee', ' rhode island',
' district of columbia', ' british columbia', ' missouri',
' germany', ' pennsylvania', ' netherlands', ' switzerland',
' ohio'], dtype=object)
# City is everything before the first comma in `location`.
ok_cupid_df['city'] = ok_cupid_df['location'].str.partition(',')[0]
ok_cupid_df_copy_city = ok_cupid_df.copy()
# Frame with one row per distinct city; coordinates are added below.
city_data = {'city': ok_cupid_df_copy_city['city'].unique()}
cities_df = pd.DataFrame(city_data)
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
# Accumulators filled in cities_df row order by the geocoding loop below.
longitude = []
latitude = []
def findGeocode(city, max_retries=3):
    """Geocode a city name with Nominatim.

    Returns the geopy location object, or None when the service keeps
    timing out. The original retried via unbounded recursion, which can
    blow the stack (and hammer the service) on a persistent timeout;
    a bounded retry loop is used instead.
    """
    geolocator = Nominatim(user_agent="your_app_name")
    for _ in range(max_retries):
        try:
            return geolocator.geocode(city)
        except GeocoderTimedOut:
            continue
    return None
# Geocode each distinct city exactly once. The original called
# findGeocode twice per city — doubling the network requests and risking
# two different responses for the same city.
for city_name in cities_df["city"]:
    loc = findGeocode(city_name)
    if loc is not None:
        latitude.append(loc.latitude)
        longitude.append(loc.longitude)
    else:
        # Unresolved city: keep row alignment with NaN coordinates.
        latitude.append(np.nan)
        longitude.append(np.nan)
cities_df["longitude"] = longitude
cities_df["latitude"] = latitude
cities_df.head(2)
city | longitude | latitude | |
---|---|---|---|
0 | south san francisco | -122.416866 | 37.653540 |
1 | oakland | -122.271356 | 37.804456 |
#cities_df.rename(columns = {'City':'city'}, inplace = True)
# Profile count per city — used to size the map markers below.
df_cities = ok_cupid_df_copy_city['city'].value_counts().rename_axis('city').reset_index(name='counts')
df_cities.head(2)
city | counts | |
---|---|---|
0 | san francisco | 31063 |
1 | oakland | 7214 |
# Attach coordinates; the inner join silently drops any city that failed to geocode.
df_cities = pd.merge(cities_df, df_cities, on='city', how='inner')
df_cities.head(2)
city | longitude | latitude | counts | |
---|---|---|---|---|
0 | south san francisco | -122.416866 | 37.653540 | 416 |
1 | oakland | -122.271356 | 37.804456 | 7214 |
# Bubble map: one marker per city, sized by profile count.
# NOTE(review): color_continuous_scale has no effect here — 'city' is
# categorical, so plotly uses a discrete color sequence instead.
fig = px.scatter_mapbox(df_cities, lat="latitude", lon="longitude",
                        size='counts',
                        color='city',
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        #size_max=15, zoom=10,
                        mapbox_style="carto-positron"
                        )
fig.show()
# The state/region is the second location component; strip the leading space.
ok_cupid_df['state'] = ok_cupid_df['location'].str.split(',', expand=True)[1]
ok_cupid_df['state'] = ok_cupid_df['state'].str.lstrip(' ')
ok_cupid_df['state'].unique()
array(['california', 'colorado', 'new york', 'oregon', 'arizona',
'hawaii', 'montana', 'wisconsin', 'virginia', 'spain', 'nevada',
'illinois', 'vietnam', 'ireland', 'louisiana', 'michigan', 'texas',
'united kingdom', 'massachusetts', 'north carolina', 'idaho',
'mississippi', 'new jersey', 'florida', 'minnesota', 'georgia',
'utah', 'washington', 'west virginia', 'connecticut', 'tennessee',
'rhode island', 'district of columbia', 'british columbia',
'missouri', 'germany', 'pennsylvania', 'netherlands',
'switzerland', 'ohio'], dtype=object)
# Default everyone to the US, then patch the handful of non-US rows case
# by case. Order matters: each state-level fix is immediately followed by
# the city-level country fix for the same profile(s).
ok_cupid_df['country'] = 'united states'
# Spain (madrid)
ok_cupid_df.loc[ok_cupid_df['state'] == 'spain', 'state'] = 'madrid'
ok_cupid_df.loc[ok_cupid_df['city'] == 'madrid', 'country'] = 'spain'
# Vietnam (nha trang)
ok_cupid_df.loc[ok_cupid_df['state'] == 'vietnam', 'state'] = 'khanh hoa'
ok_cupid_df.loc[ok_cupid_df['city'] == 'nha trang', 'country'] = 'vietnam'
# Ireland (cork)
ok_cupid_df.loc[ok_cupid_df['state'] == 'ireland', 'state'] = 'munster'
ok_cupid_df.loc[ok_cupid_df['city'] == 'cork', 'country'] = 'ireland'
# UK rows had state 'united kingdom'; split by city into scotland/england.
ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'state'] = 'scotland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'country'] = 'united kingdom'
ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'state'] = 'england'
ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'country'] = 'united kingdom'
# Germany (kassel)
ok_cupid_df.loc[ok_cupid_df['state'] == 'germany', 'state'] = 'hessen'
ok_cupid_df.loc[ok_cupid_df['city'] == 'kassel', 'country'] = 'germany'
# Netherlands (amsterdam)
ok_cupid_df.loc[ok_cupid_df['state'] == 'netherlands', 'state'] = 'north holland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'amsterdam', 'country'] = 'netherlands'
# Switzerland (bonaduz)
ok_cupid_df.loc[ok_cupid_df['state'] == 'switzerland', 'state'] = 'graubunden'
ok_cupid_df.loc[ok_cupid_df['city'] == 'bonaduz', 'country'] = 'switzerland'
# Canada: state was already 'british columbia', only the country changes.
ok_cupid_df.loc[ok_cupid_df['city'] == 'vancouver', 'country'] = 'canada'
# city/state/country now carry all the information from location.
ok_cupid_df = ok_cupid_df.drop(columns=['location'])
# Sanity check: the lone Canadian profile.
ok_cupid_df[ok_cupid_df['country'] == 'canada']
age | height | income | last_online | offspring | pets | religion | sign | speaks | essay | ... | kosher | otherdiet | vegan | vegetarian | drugs | education | smokes | city | state | country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42435 | 32 | 63.0 | 60000 | 2012-06-28-18-38 | doesn't have kids | unknown_pets | other | aquarius | english (fluently), chinese (poorly), french (... | im happiest when wearing sunglasses and flipfl... | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | vancouver | british columbia | canada |
1 rows × 166 columns
ok_cupid_df['state'].unique()
array(['california', 'colorado', 'new york', 'oregon', 'arizona',
'hawaii', 'montana', 'wisconsin', 'virginia', 'madrid', 'nevada',
'illinois', 'khanh hoa', 'munster', 'louisiana', 'michigan',
'texas', 'scotland', 'england', 'massachusetts', 'north carolina',
'idaho', 'mississippi', 'new jersey', 'florida', 'minnesota',
'georgia', 'utah', 'washington', 'west virginia', 'connecticut',
'tennessee', 'rhode island', 'district of columbia',
'british columbia', 'missouri', 'hessen', 'pennsylvania',
'north holland', 'graubunden', 'ohio'], dtype=object)
ok_cupid_df['country'].unique()
array(['united states', 'spain', 'vietnam', 'ireland', 'united kingdom',
'canada', 'germany', 'netherlands', 'switzerland'], dtype=object)
# One-hot encode the three geography columns. The fitted encoders are
# kept so their .categories_ can name the columns below.
city_ohe = OneHotEncoder()
state_ohe = OneHotEncoder()
country_ohe = OneHotEncoder()
# OneHotEncoder expects 2D input, hence the single-column DataFrames.
city = pd.DataFrame(ok_cupid_df['city'])
state = pd.DataFrame(ok_cupid_df['state'])
country = pd.DataFrame(ok_cupid_df['country'])
city_encoded = city_ohe.fit_transform(city)
state_encoded = state_ohe.fit_transform(state)
country_encoded = country_ohe.fit_transform(country)
display(city_encoded)
display(state_encoded)
display(country_encoded)
<59941x197 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x41 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
<59941x9 sparse matrix of type '<class 'numpy.float64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Put into a dataframe to get column names
# City: dense 0/1 indicators; drop the first (alphabetical) category to
# avoid perfectly collinear dummy columns.
encoded_df_city = pd.DataFrame(city_encoded.toarray().astype(int), columns=city_ohe.categories_[0], dtype=int)
encoded_df_city = encoded_df_city.drop(encoded_df_city.columns[0], axis=1)
display(encoded_df_city.head(2))
# State
encoded_df_state = pd.DataFrame(state_encoded.toarray().astype(int), columns=state_ohe.categories_[0], dtype=int)
encoded_df_state = encoded_df_state.drop(encoded_df_state.columns[0], axis=1)
display(encoded_df_state.head(2))
# Country
encoded_df_country = pd.DataFrame(country_encoded.toarray().astype(int), columns=country_ohe.categories_[0], dtype=int)
encoded_df_country = encoded_df_country.drop(encoded_df_country.columns[0], axis=1)
display(encoded_df_country.head(2))
albany | amsterdam | arcadia | asheville | ashland | astoria | atherton | atlanta | austin | bayshore | ... | vallejo | vancouver | walnut creek | washington | waterford | west oakland | westlake | woodacre | woodbridge | woodside | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 196 columns
british columbia | california | colorado | connecticut | district of columbia | england | florida | georgia | graubunden | hawaii | ... | pennsylvania | rhode island | scotland | tennessee | texas | utah | virginia | washington | west virginia | wisconsin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 40 columns
germany | ireland | netherlands | spain | switzerland | united kingdom | united states | vietnam | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# Join back to ok_cupid_df
# NOTE(review): concat aligns on index — this assumes ok_cupid_df still
# has the default 0..n-1 RangeIndex matching the encoded frames; confirm
# it was reset after any upstream row drops.
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_city], axis=1)
ok_cupid_df.drop(columns='city', inplace=True)
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_state], axis=1)
ok_cupid_df.drop(columns='state', inplace=True)
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_country], axis=1)
ok_cupid_df.drop(columns='country', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | offspring | pets | religion | sign | speaks | essay | ... | west virginia | wisconsin | germany | ireland | netherlands | spain | switzerland | united kingdom | united states | vietnam | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism and very serious about it | gemini | english | about me i would love to think that i was som... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | doesn't have kids, but might want them | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 rows × 407 columns
Offspring:
ok_cupid_df['offspring'].value_counts()
unknown_offspring 35558
doesn't have kids 7559
doesn't have kids, but might want them 3875
doesn't have kids, but wants them 3565
doesn't want kids 2927
has kids 1883
has a kid 1881
doesn't have kids, and doesn't want any 1132
has kids, but doesn't want more 442
has a kid, but doesn't want more 275
has a kid, and might want more 231
wants kids 225
might want kids 181
has kids, and might want more 115
has a kid, and wants more 71
has kids, and wants more 21
Name: offspring, dtype: int64
# Rewrite each free-text offspring answer as space-separated tokens so
# CountVectorizer below can split it into independent indicator features.
offspring_token_map = {
    "doesn't have kids": "doesnt_have_kids",
    "doesn't have kids, but might want them": "doesnt_have_kids might_want_kids",
    "doesn't have kids, but wants them": "doesnt_have_kids wants_kids",
    "doesn't want kids": "doesnt_want_kids",
    "has kids": "has_kids",
    "has a kid": "has_a_kid",
    "doesn't have kids, and doesn't want any": "doesnt_have_kids doesnt_want_kids",
    "has kids, but doesn't want more": "has_kids doesnt_want_kids",
    "has a kid, but doesn't want more": "has_a_kid doesnt_want_kids",
    "has a kid, and might want more": "has_a_kid might_want_kids",
    "wants kids": "wants_kids",
    "might want kids": "might_want_kids",
    "has kids, and might want more": "has_kids might_want_kids",
    "has a kid, and wants more": "has_a_kid wants_kids",
    "has kids, and wants more": "has_kids wants_kids",
}
ok_cupid_df['offspring'] = ok_cupid_df['offspring'].replace(offspring_token_map)
ok_cupid_df['offspring'].value_counts()
unknown_offspring 35558
doesnt_have_kids 7559
doesnt_have_kids might_want_kids 3875
doesnt_have_kids wants_kids 3565
doesnt_want_kids 2927
has_kids 1883
has_a_kid 1881
doesnt_have_kids doesnt_want_kids 1132
has_kids doesnt_want_kids 442
has_a_kid doesnt_want_kids 275
has_a_kid might_want_kids 231
wants_kids 225
might_want_kids 181
has_kids might_want_kids 115
has_a_kid wants_kids 71
has_kids wants_kids 21
Name: offspring, dtype: int64
# Vectorize the tokenized offspring answers: fit_transform learns the
# vocabulary and produces the sparse count matrix in one step
# (equivalent to the separate fit + transform calls).
offspring = CountVectorizer()
offspring_transformed = offspring.fit_transform(ok_cupid_df["offspring"])
offspring_transformed
<59941x7 sparse matrix of type '<class 'numpy.int64'>'
with 69668 stored elements in Compressed Sparse Row format>
# Expand the sparse matrix into named 0/1 columns.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# newer versions need get_feature_names_out().
offspring_df = pd.DataFrame(columns=offspring.get_feature_names(), data=offspring_transformed.toarray())
offspring_df_copy = offspring_df.copy()
# Drop one column to prevent redundant information
offspring_df = offspring_df.drop(columns='unknown_offspring')
offspring_df.head(2)
doesnt_have_kids | doesnt_want_kids | has_a_kid | has_kids | might_want_kids | wants_kids | |
---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 0 | 0 | 1 | 0 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, offspring_df], axis=1)
ok_cupid_df.drop(columns='offspring', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | pets | religion | sign | speaks | essay | essay_len | ... | switzerland | united kingdom | united states | vietnam | doesnt_have_kids | doesnt_want_kids | has_a_kid | has_kids | might_want_kids | wants_kids | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | likes dogs and likes cats | agnosticism and very serious about it | gemini | english | about me i would love to think that i was som... | 2389 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
2 rows × 412 columns
Pets:
ok_cupid_df['pets'].value_counts()
unknown_pets 19917
likes dogs and likes cats 14814
likes dogs 7224
likes dogs and has cats 4313
has dogs 4134
has dogs and likes cats 2333
likes dogs and dislikes cats 2029
has dogs and has cats 1474
has cats 1406
likes cats 1062
has dogs and dislikes cats 552
dislikes dogs and likes cats 240
dislikes dogs and dislikes cats 196
dislikes cats 122
dislikes dogs and has cats 81
dislikes dogs 44
Name: pets, dtype: int64
# Rewrite each pets answer as space-separated tokens so CountVectorizer
# below can split it into independent dog/cat indicator features.
pets_token_map = {
    'likes dogs and likes cats': 'likes_dogs likes_cats',
    'likes dogs': 'likes_dogs',
    'likes dogs and has cats': 'likes_dogs has_cats',
    'has dogs': 'has_dogs',
    'has dogs and likes cats': 'has_dogs likes_cats',
    'likes dogs and dislikes cats': 'likes_dogs dislikes_cats',
    'has dogs and has cats': 'has_dogs has_cats',
    'has cats': 'has_cats',
    'likes cats': 'likes_cats',
    'has dogs and dislikes cats': 'has_dogs dislikes_cats',
    'dislikes dogs and likes cats': 'dislikes_dogs likes_cats',
    'dislikes dogs and dislikes cats': 'dislikes_dogs dislikes_cats',
    'dislikes cats': 'dislikes_cats',
    'dislikes dogs and has cats': 'dislikes_dogs has_cats',
    'dislikes dogs': 'dislikes_dogs',
}
ok_cupid_df['pets'] = ok_cupid_df['pets'].replace(pets_token_map)
ok_cupid_df['pets'].value_counts()
unknown_pets 19917
likes_dogs likes_cats 14814
likes_dogs 7224
likes_dogs has_cats 4313
has_dogs 4134
has_dogs likes_cats 2333
likes_dogs dislikes_cats 2029
has_dogs has_cats 1474
has_cats 1406
likes_cats 1062
has_dogs dislikes_cats 552
dislikes_dogs likes_cats 240
dislikes_dogs dislikes_cats 196
dislikes_cats 122
dislikes_dogs has_cats 81
dislikes_dogs 44
Name: pets, dtype: int64
# Vectorize the tokenized pets answers; fit_transform is equivalent to
# the separate fit + transform calls.
pets = CountVectorizer()
pets_transformed = pets.fit_transform(ok_cupid_df["pets"])
pets_transformed
<59941x7 sparse matrix of type '<class 'numpy.int64'>'
with 85973 stored elements in Compressed Sparse Row format>
# Expand the sparse matrix into named 0/1 columns.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# newer versions need get_feature_names_out().
pets_df = pd.DataFrame(columns=pets.get_feature_names(), data=pets_transformed.toarray())
pets_df_copy = pets_df.copy()
# Drop one column to prevent redundant information
pets_df = pets_df.drop(columns='unknown_pets')
pets_df.head(2)
dislikes_cats | dislikes_dogs | has_cats | has_dogs | likes_cats | likes_dogs | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 0 | 0 | 0 | 0 | 1 | 1 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, pets_df], axis=1)
ok_cupid_df.drop(columns='pets', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | religion | sign | speaks | essay | essay_len | male | ... | has_a_kid | has_kids | might_want_kids | wants_kids | dislikes_cats | dislikes_dogs | has_cats | has_dogs | likes_cats | likes_dogs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | agnosticism and very serious about it | gemini | english | about me i would love to think that i was som... | 2389 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
2 rows × 417 columns
Religion:
ok_cupid_df['religion'].value_counts()
unknown_religion 20222
agnosticism 2724
other 2691
agnosticism but not too serious about it 2636
agnosticism and laughing about it 2496
catholicism but not too serious about it 2318
atheism 2175
other and laughing about it 2119
atheism and laughing about it 2074
christianity 1957
christianity but not too serious about it 1952
other but not too serious about it 1554
judaism but not too serious about it 1517
atheism but not too serious about it 1318
catholicism 1064
christianity and somewhat serious about it 927
atheism and somewhat serious about it 848
other and somewhat serious about it 845
catholicism and laughing about it 726
judaism and laughing about it 681
buddhism but not too serious about it 650
agnosticism and somewhat serious about it 642
judaism 612
christianity and very serious about it 578
atheism and very serious about it 570
catholicism and somewhat serious about it 548
other and very serious about it 533
buddhism and laughing about it 466
buddhism 403
christianity and laughing about it 373
buddhism and somewhat serious about it 359
agnosticism and very serious about it 314
judaism and somewhat serious about it 266
hinduism but not too serious about it 227
hinduism 107
catholicism and very serious about it 102
buddhism and very serious about it 70
hinduism and somewhat serious about it 58
islam 48
hinduism and laughing about it 44
islam but not too serious about it 40
islam and somewhat serious about it 22
judaism and very serious about it 22
islam and laughing about it 16
hinduism and very serious about it 14
islam and very serious about it 13
Name: religion, dtype: int64
# Collapse each answer to a single token (remove spaces) so CountVectorizer
# treats the whole religion+qualifier phrase as one feature; rename 'other'
# to avoid a column-name collision (cf. the 'otherdiet' column earlier).
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace(' ', '')
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace('other', 'otherreligion')
ok_cupid_df['religion'].value_counts()
unknown_religion 20222
agnosticism 2724
otherreligion 2691
agnosticismbutnottooseriousaboutit 2636
agnosticismandlaughingaboutit 2496
catholicismbutnottooseriousaboutit 2318
atheism 2175
otherreligionandlaughingaboutit 2119
atheismandlaughingaboutit 2074
christianity 1957
christianitybutnottooseriousaboutit 1952
otherreligionbutnottooseriousaboutit 1554
judaismbutnottooseriousaboutit 1517
atheismbutnottooseriousaboutit 1318
catholicism 1064
christianityandsomewhatseriousaboutit 927
atheismandsomewhatseriousaboutit 848
otherreligionandsomewhatseriousaboutit 845
catholicismandlaughingaboutit 726
judaismandlaughingaboutit 681
buddhismbutnottooseriousaboutit 650
agnosticismandsomewhatseriousaboutit 642
judaism 612
christianityandveryseriousaboutit 578
atheismandveryseriousaboutit 570
catholicismandsomewhatseriousaboutit 548
otherreligionandveryseriousaboutit 533
buddhismandlaughingaboutit 466
buddhism 403
christianityandlaughingaboutit 373
buddhismandsomewhatseriousaboutit 359
agnosticismandveryseriousaboutit 314
judaismandsomewhatseriousaboutit 266
hinduismbutnottooseriousaboutit 227
hinduism 107
catholicismandveryseriousaboutit 102
buddhismandveryseriousaboutit 70
hinduismandsomewhatseriousaboutit 58
islam 48
hinduismandlaughingaboutit 44
islambutnottooseriousaboutit 40
islamandsomewhatseriousaboutit 22
judaismandveryseriousaboutit 22
islamandlaughingaboutit 16
hinduismandveryseriousaboutit 14
islamandveryseriousaboutit 13
Name: religion, dtype: int64
# One token per row, so this yields one-hot religion+qualifier columns.
religion = CountVectorizer()
religion_transformed = religion.fit_transform(ok_cupid_df["religion"])
religion_transformed
<59941x46 sparse matrix of type '<class 'numpy.int64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Named 0/1 columns, one per religion+qualifier token.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2.
religion_df = pd.DataFrame(columns=religion.get_feature_names(), data=religion_transformed.toarray())
# Drop one column to prevent redundant information
religion_df = religion_df.drop(columns=['unknown_religion'])
religion_df.head(2)
agnosticism | agnosticismandlaughingaboutit | agnosticismandsomewhatseriousaboutit | agnosticismandveryseriousaboutit | agnosticismbutnottooseriousaboutit | atheism | atheismandlaughingaboutit | atheismandsomewhatseriousaboutit | atheismandveryseriousaboutit | atheismbutnottooseriousaboutit | ... | judaism | judaismandlaughingaboutit | judaismandsomewhatseriousaboutit | judaismandveryseriousaboutit | judaismbutnottooseriousaboutit | otherreligion | otherreligionandlaughingaboutit | otherreligionandsomewhatseriousaboutit | otherreligionandveryseriousaboutit | otherreligionbutnottooseriousaboutit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 45 columns
# Collapse the one-hot religion+qualifier columns into a single ordinal
# column per religion:
#   4 = stated with no qualifier, or "very serious about it"
#   3 = "somewhat serious about it"
#   2 = "but not too serious about it"
#   1 = "laughing about it"
#   0 = religion not stated
# The base religion names are the columns without a seriousness suffix.
# A comprehension replaces the original's four list.remove()-while-
# iterating loops, a fragile pattern that skips the element following
# each removal (it only worked here by accident of column ordering).
seriousness_scores = [('andveryseriousaboutit', 4),
                      ('andlaughingaboutit', 1),
                      ('andsomewhatseriousaboutit', 3),
                      ('butnottooseriousaboutit', 2)]
suffixes = tuple(s for s, _ in seriousness_scores)
religions = [c for c in religion_df.columns if not c.endswith(suffixes)]
# An unqualified religion counts as fully serious (4), matching the original.
for base in religions:
    religion_df.loc[religion_df[base] == 1, base] = 4
# Fold each qualified column's score into its base column, then drop it.
for suffix, score in seriousness_scores:
    for base in religions:
        qualified = base + suffix
        if qualified in religion_df.columns:
            religion_df.loc[religion_df[qualified] == 1, base] = score
            religion_df = religion_df.drop(columns=qualified)
religion_df.head(2)
agnosticism | atheism | buddhism | catholicism | christianity | hinduism | islam | judaism | otherreligion | |
---|---|---|---|---|---|---|---|---|---|
0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, religion_df], axis=1)
ok_cupid_df.drop(columns='religion', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | sign | speaks | essay | essay_len | male | is_religious | ... | likes_dogs | agnosticism | atheism | buddhism | catholicism | christianity | hinduism | islam | judaism | otherreligion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | gemini | english | about me i would love to think that i was som... | 2389 | 1 | 0 | ... | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | cancer | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | ... | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 425 columns
Sign:
# Strip the HTML entity '&rsquo;' (curly apostrophe) left in the sign text.
ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace('&', '').str.replace('rsquo;','')
ok_cupid_df['sign'].value_counts()
unknown_sign 11053
gemini and its fun to think about 1782
scorpio and its fun to think about 1772
leo and its fun to think about 1692
libra and its fun to think about 1649
taurus and its fun to think about 1640
cancer and its fun to think about 1597
pisces and its fun to think about 1592
sagittarius and its fun to think about 1583
virgo and its fun to think about 1574
aries and its fun to think about 1573
aquarius and its fun to think about 1503
virgo but it doesnt matter 1497
leo but it doesnt matter 1457
cancer but it doesnt matter 1454
gemini but it doesnt matter 1453
taurus but it doesnt matter 1450
libra but it doesnt matter 1408
aquarius but it doesnt matter 1407
capricorn and its fun to think about 1376
sagittarius but it doesnt matter 1375
aries but it doesnt matter 1373
capricorn but it doesnt matter 1319
pisces but it doesnt matter 1300
scorpio but it doesnt matter 1264
leo 1159
libra 1098
cancer 1092
virgo 1029
scorpio 1020
gemini 1013
taurus 1001
aries 995
pisces 992
aquarius 954
sagittarius 937
capricorn 833
scorpio and it matters a lot 78
leo and it matters a lot 66
cancer and it matters a lot 63
aquarius and it matters a lot 63
pisces and it matters a lot 62
gemini and it matters a lot 62
libra and it matters a lot 52
taurus and it matters a lot 49
aries and it matters a lot 47
sagittarius and it matters a lot 47
capricorn and it matters a lot 45
virgo and it matters a lot 41
Name: sign, dtype: int64
# Collapse each sign+qualifier phrase to a single token for CountVectorizer.
ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace(' ', '')
ok_cupid_df['sign'].value_counts()
unknown_sign 11053
geminianditsfuntothinkabout 1782
scorpioanditsfuntothinkabout 1772
leoanditsfuntothinkabout 1692
libraanditsfuntothinkabout 1649
taurusanditsfuntothinkabout 1640
canceranditsfuntothinkabout 1597
piscesanditsfuntothinkabout 1592
sagittariusanditsfuntothinkabout 1583
virgoanditsfuntothinkabout 1574
ariesanditsfuntothinkabout 1573
aquariusanditsfuntothinkabout 1503
virgobutitdoesntmatter 1497
leobutitdoesntmatter 1457
cancerbutitdoesntmatter 1454
geminibutitdoesntmatter 1453
taurusbutitdoesntmatter 1450
librabutitdoesntmatter 1408
aquariusbutitdoesntmatter 1407
capricornanditsfuntothinkabout 1376
sagittariusbutitdoesntmatter 1375
ariesbutitdoesntmatter 1373
capricornbutitdoesntmatter 1319
piscesbutitdoesntmatter 1300
scorpiobutitdoesntmatter 1264
leo 1159
libra 1098
cancer 1092
virgo 1029
scorpio 1020
gemini 1013
taurus 1001
aries 995
pisces 992
aquarius 954
sagittarius 937
capricorn 833
scorpioanditmattersalot 78
leoanditmattersalot 66
canceranditmattersalot 63
aquariusanditmattersalot 63
piscesanditmattersalot 62
geminianditmattersalot 62
libraanditmattersalot 52
taurusanditmattersalot 49
ariesanditmattersalot 47
sagittariusanditmattersalot 47
capricornanditmattersalot 45
virgoanditmattersalot 41
Name: sign, dtype: int64
# One token per row, so this yields one-hot sign+qualifier columns.
sign = CountVectorizer()
sign_transformed = sign.fit_transform(ok_cupid_df["sign"])
sign_transformed
<59941x49 sparse matrix of type '<class 'numpy.int64'>'
with 59941 stored elements in Compressed Sparse Row format>
# Named 0/1 columns, one per sign+qualifier token.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2.
sign_df = pd.DataFrame(columns=sign.get_feature_names(), data=sign_transformed.toarray())
# Drop one column to prevent redundant information
sign_df = sign_df.drop(columns=['unknown_sign'])
sign_df.head(2)
aquarius | aquariusanditmattersalot | aquariusanditsfuntothinkabout | aquariusbutitdoesntmatter | aries | ariesanditmattersalot | ariesanditsfuntothinkabout | ariesbutitdoesntmatter | cancer | canceranditmattersalot | ... | scorpioanditsfuntothinkabout | scorpiobutitdoesntmatter | taurus | taurusanditmattersalot | taurusanditsfuntothinkabout | taurusbutitdoesntmatter | virgo | virgoanditmattersalot | virgoanditsfuntothinkabout | virgobutitdoesntmatter | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 48 columns
# Collapse the one-hot sign+qualifier columns into a single ordinal
# column per sign:
#   3 = "it matters a lot", or stated with no qualifier
#   2 = "but it doesnt matter"
#   1 = "its fun to think about"
#   0 = sign not stated
# A comprehension replaces the original's list.remove()-while-iterating
# loops, a fragile pattern that skips the element following each removal
# (it only worked here by accident of column ordering).
importance_scores = [('butitdoesntmatter', 2),
                     ('anditmattersalot', 3),
                     ('anditsfuntothinkabout', 1)]
suffixes = tuple(s for s, _ in importance_scores)
signs = [c for c in sign_df.columns if not c.endswith(suffixes)]
# An unqualified sign is scored 3, same as "matters a lot" (as before).
for base in signs:
    sign_df.loc[sign_df[base] == 1, base] = 3
# Fold each qualified column's score into its base column, then drop it.
for suffix, score in importance_scores:
    for base in signs:
        qualified = base + suffix
        if qualified in sign_df.columns:
            sign_df.loc[sign_df[qualified] == 1, base] = score
            sign_df = sign_df.drop(columns=qualified)
sign_df.head(2)
aquarius | aries | cancer | capricorn | gemini | leo | libra | pisces | sagittarius | scorpio | taurus | virgo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Join back to ok_cupid_df
# (concat aligns on index; assumes matching RangeIndex — see earlier joins)
ok_cupid_df = pd.concat([ok_cupid_df, sign_df], axis=1)
ok_cupid_df.drop(columns='sign', inplace=True)
ok_cupid_df.head(2)
age | height | income | last_online | speaks | essay | essay_len | male | is_religious | is_agnostic | ... | cancer | capricorn | gemini | leo | libra | pisces | sagittarius | scorpio | taurus | virgo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | english | about me i would love to think that i was som... | 2389 | 1 | 0 | 1 | ... | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | english (fluently), spanish (poorly), french (... | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 1 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 436 columns
Speaks (languages):
# Normalize speaks: drop spaces, fold the "(fluently)"/"(okay)"/"(poorly)"
# qualifiers into suffixes on the language token, and use spaces as the
# language separator for CountVectorizer.
# NOTE(review): str.replace('(', ...) relies on literal matching; on pandas
# versions where regex=True is the default this raises on the unbalanced
# paren — confirm the pinned pandas version or pass regex=False.
ok_cupid_df['speaks'] = ok_cupid_df['speaks'].str.replace(' ', '').str.replace('(', '_').str.replace(')', '').str.replace(',', ' ').str.replace('_', '')
ok_cupid_df['speaks'].value_counts()
english 21826
englishfluently 6627
englishfluently spanishpoorly 2059
englishfluently spanishokay 1917
englishfluently spanishfluently 1288
...
englishfluently norwegianfluently swedishokay germanokay 1
english spanish portuguese thai 1
englishfluently frenchokay bulgarianpoorly chechenpoorly chinesepoorly 1
englishfluently chinesepoorly vietnamesepoorly japanesepoorly spanishokay 1
englishfluently chinesefluently japanesepoorly c++fluently otherfluently 1
Name: speaks, Length: 7648, dtype: int64
# Count-vectorize the space-separated language+proficiency tokens.
speaks = CountVectorizer()
speaks_transformed = speaks.fit_transform(ok_cupid_df["speaks"])
speaks_transformed
<59941x302 sparse matrix of type '<class 'numpy.int64'>'
with 110527 stored elements in Compressed Sparse Row format>
# Named 0/1 columns; drop the unknown marker, the bare qualifier tokens
# that appear without a language, and the joke 'lisp' entries.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2.
speaks_df = pd.DataFrame(columns=speaks.get_feature_names(), data=speaks_transformed.toarray())
# Drop one column to prevent redundant information
speaks_df = speaks_df.drop(columns=['unknownspeaks', 'poorly', 'fluently', 'okay', 'lisp', 'lispokay', 'lisppoorly', 'lispfluently'])
speaks_df.head(2)
afrikaans | afrikaansfluently | afrikaansokay | afrikaanspoorly | albanian | albanianfluently | albanianokay | albanianpoorly | ancientgreek | ancientgreekfluently | ... | vietnameseokay | vietnamesepoorly | welsh | welshfluently | welshokay | welshpoorly | yiddish | yiddishfluently | yiddishokay | yiddishpoorly | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 294 columns
# These three languages only occur with a 'fluently' qualifier, so the
# qualified column doubles as the base column.
speaks_df = speaks_df.rename(columns={'armenianfluently': 'armenian', 'slovenianfluently': 'slovenian', 'sardinianfluently': 'sardinian'})
# Collapse the language+proficiency one-hots into one ordinal column per
# language:
#   3 = fluently, or stated with no qualifier
#   2 = okay
#   1 = poorly
#   0 = language not listed
# A comprehension replaces the original's list.remove()-while-iterating
# loops, a fragile pattern that skips the element following each removal
# (it only worked here by accident of column ordering).
proficiency_scores = [('fluently', 3), ('okay', 2), ('poorly', 1)]
suffixes = tuple(s for s, _ in proficiency_scores)
languages = [c for c in speaks_df.columns if not c.endswith(suffixes)]
# An unqualified language counts as fluent (3), matching the original.
for base in languages:
    speaks_df.loc[speaks_df[base] == 1, base] = 3
# Fold each qualified column's score into its base column, then drop it.
for suffix, score in proficiency_scores:
    for base in languages:
        qualified = base + suffix
        if qualified in speaks_df.columns:
            speaks_df.loc[speaks_df[qualified] == 1, base] = score
            speaks_df = speaks_df.drop(columns=qualified)
speaks_df.head(2)
afrikaans | albanian | ancientgreek | arabic | armenian | basque | belarusan | bengali | breton | bulgarian | ... | tagalog | tamil | thai | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 75 columns
# Attach the per-language ordinal columns to the main frame, then drop the
# raw free-text 'speaks' column they were derived from.
ok_cupid_df = pd.concat([ok_cupid_df, speaks_df], axis=1).drop(columns='speaks')
ok_cupid_df.head(2)
age | height | income | last_online | essay | essay_len | male | is_religious | is_agnostic | is_atheist | ... | tagalog | tamil | thai | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | -1 | 2012-06-28-20-30 | about me i would love to think that i was som... | 2389 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35 | 70.0 | 80000 | 2012-06-29-21-41 | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 510 columns
Drop income:
(mostly the -1 placeholder value, i.e. effectively missing)
ok_cupid_df.drop(columns='income', inplace=True)
Last online:
ok_cupid_df['last_online']
0 2012-06-28-20-30
1 2012-06-29-21-41
2 2012-06-27-09-10
3 2012-06-28-14-22
4 2012-06-27-21-26
...
59936 2012-06-12-21-47
59937 2012-06-29-11-01
59938 2012-06-27-23-37
59939 2012-06-23-13-01
59940 2012-06-29-00-42
Name: last_online, Length: 59941, dtype: object
# Derive year / month / weekday-flag features from the "YYYY-MM-DD-HH-MM"
# last_online strings. Split once and reuse the parts (the original re-split
# the column five times).
last_online_parts = ok_cupid_df['last_online'].str.split("-", expand=True)
ok_cupid_df['last_online_year'] = last_online_parts[0].astype('int')
ok_cupid_df['last_online_month'] = last_online_parts[1].astype('int')
# Parse the date explicitly: unit-less .astype('datetime64') is rejected by
# pandas >= 2.0, so use pd.to_datetime with a fixed format instead.
last_online_datetime = pd.to_datetime(
    last_online_parts[0] + '-' + last_online_parts[1] + '-' + last_online_parts[2],
    format='%Y-%m-%d',
)
# Binary flag: 1 = weekday (Mon-Fri, dt.weekday < 5), 0 = weekend.
ok_cupid_df['last_online_weekday'] = np.where(last_online_datetime.dt.weekday < 5, 1, 0)
ok_cupid_df['last_online_weekday']
0 1
1 1
2 1
3 1
4 1
..
59936 1
59937 1
59938 1
59939 0
59940 1
Name: last_online_weekday, Length: 59941, dtype: int64
#ok_cupid_df['last_online_year'] = ok_cupid_df['last_online'].str.split("-", expand=True)[0]
# Sanity-check: reassemble the "YYYY-MM-DD" date portion for display only;
# the result is not stored anywhere.
ok_cupid_df['last_online'].str.split("-", expand=True)[0]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[1]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[2]
0 2012-06-28
1 2012-06-29
2 2012-06-27
3 2012-06-28
4 2012-06-27
...
59936 2012-06-12
59937 2012-06-29
59938 2012-06-27
59939 2012-06-23
59940 2012-06-29
Length: 59941, dtype: object
# Drop the raw timestamp (now encoded as year/month/weekday features) and
# is_agnostic (presumably redundant with the other religion flags — confirm).
ok_cupid_df = ok_cupid_df.drop(columns=['last_online', 'is_agnostic'])
ok_cupid_df.head(2)
age | height | essay | essay_len | male | is_religious | is_atheist | is_straight | gay | straight | ... | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | last_online_year | last_online_month | last_online_weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | about me i would love to think that i was som... | 2389 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
1 | 35 | 70.0 | i am a chef this is what that means 1 i am a w... | 1340 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
2 rows × 510 columns
# Remove the raw concatenated essay text; the numeric essay_len column remains.
ok_cupid_df.drop(columns='essay', inplace=True)
ok_cupid_df.head(2)
age | height | essay_len | male | is_religious | is_atheist | is_straight | gay | straight | available | ... | tibetan | turkish | ukrainian | urdu | vietnamese | welsh | yiddish | last_online_year | last_online_month | last_online_weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 75.0 | 2389 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
1 | 35 | 70.0 | 1340 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2012 | 6 | 1 |
2 rows × 509 columns
ok_cupid_df.isna().sum().sum()
0
ok_cupid_df.to_csv (r'data/okcupid_profiles_clean.csv', index = False, header=True)