project

Testing

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
import plotly.graph_objects as go
ok_cupid_df = pd.read_csv('data/okcupid_profiles.csv')
ok_cupid_df.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
      dtype='object')
ok_cupid_df.shape
(59946, 31)
ok_cupid_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smokes       54434 non-null  object 
 20  speaks       59896 non-null  object 
 21  essay0       54458 non-null  object 
 22  essay1       52374 non-null  object 
 23  essay2       50308 non-null  object 
 24  essay3       48470 non-null  object 
 25  essay4       49409 non-null  object 
 26  essay5       49096 non-null  object 
 27  essay6       46175 non-null  object 
 28  essay7       47495 non-null  object 
 29  essay8       40721 non-null  object 
 30  essay9       47343 non-null  object 
dtypes: float64(1), int64(2), object(28)
memory usage: 14.2+ MB
ok_cupid_df.head(2)
age status sex orientation body_type diet drinks drugs education ethnicity ... essay0 essay1 essay2 essay3 essay4 essay5 essay6 essay7 essay8 essay9
0 22 single m straight a little extra strictly anything socially never working on college/university asian, white ... about me: i would love to think that i was so... currently working as an international agent fo... making people laugh. ranting about a good salt... the way i look. i am a six foot half asian, ha... books: absurdistan, the republic, of mice and ... food. water. cell phone. shelter. duality and humorous things trying to find someone to hang out with. i am ... i am new to california and looking for someone... you want to be swept off your feet! you are ti...
1 35 single m straight average mostly other often sometimes working on space camp white ... i am a chef: this is what that means. 1. i am ... dedicating everyday to being an unbelievable b... being silly. having ridiculous amonts of fun w... NaN i am die hard christopher moore fan. i don't r... delicious porkness in all of its glories. my b... NaN NaN i am very open and will share just about anyth... NaN

2 rows × 31 columns

Check null values:

ok_cupid_df.isna().sum()
age                0
status             0
sex                0
orientation        0
body_type       5296
diet           24395
drinks          2985
drugs          14080
education       6628
ethnicity       5680
height             3
income             0
job             8198
last_online        0
location           0
offspring      35561
pets           19921
religion       20226
sign           11056
smokes          5512
speaks            50
essay0          5488
essay1          7572
essay2          9638
essay3         11476
essay4         10537
essay5         10850
essay6         13771
essay7         12451
essay8         19225
essay9         12603
dtype: int64
ok_cupid_df.isna().sum()/ok_cupid_df.shape[0]*100
age             0.000000
status          0.000000
sex             0.000000
orientation     0.000000
body_type       8.834618
diet           40.694959
drinks          4.979482
drugs          23.487806
education      11.056618
ethnicity       9.475194
height          0.005005
income          0.000000
job            13.675641
last_online     0.000000
location        0.000000
offspring      59.321723
pets           33.231575
religion       33.740366
sign           18.443266
smokes          9.194942
speaks          0.083408
essay0          9.154906
essay1         12.631368
essay2         16.077803
essay3         19.143896
essay4         17.577486
essay5         18.099623
essay6         22.972342
essay7         20.770360
essay8         32.070530
essay9         21.023922
dtype: float64

Fill null values:

ok_cupid_df['drugs'] = ok_cupid_df['drugs'].fillna('unknown_drugs')
ok_cupid_df['drugs'].value_counts()
never            37724
unknown_drugs    14080
sometimes         7732
often              410
Name: drugs, dtype: int64
ok_cupid_df['diet'] = ok_cupid_df['diet'].fillna('unknowndiet')
ok_cupid_df['diet'].value_counts()
unknowndiet            24395
mostly anything        16585
anything                6183
strictly anything       5113
mostly vegetarian       3444
mostly other            1007
strictly vegetarian      875
vegetarian               667
strictly other           452
mostly vegan             338
other                    331
strictly vegan           228
vegan                    136
mostly kosher             86
mostly halal              48
strictly halal            18
strictly kosher           18
halal                     11
kosher                    11
Name: diet, dtype: int64
#ok_cupid_df.loc[(ok_cupid_df['diet'] == 'unknown')&(ok_cupid_df['essay0'] == '57'), 'status'] = 'available'
ok_cupid_df.loc[ok_cupid_df['essay0'] == "im looking for someone to share some raging adhd. im a self motivated and light hearted superhero who enjoy's riding my bike everywhere and eating every goddamn thing i can.  im looking for someone to go adventuring with. i enjoy blind drunken adventures sometimes but you dont have to be a drinker. no vegans, i will eat anything... including people... especially hipsters. im not really a nerd (i don't play magic cards/excessive videogames) but i can like nerdy girls.  i just got this account, so gimmie some time to write down more shenanigans that are important  if u make chiptunes hit me the fuck up! i wanna make some!  i am awesome, eccentric, and energetic", 'diet'] = 'strictly anything'
ok_cupid_df.loc[ok_cupid_df['essay0'] == "rabid bibliophile, humorless feminist (that's a joke), eternal student. i like to write poetry on people, bake (vegan) cupcakes, make art and dress-up.  i identify as queer but my choices here are limited so i chose bisexual.  i am quiet, empathetic, and geeky", 'diet'] = 'vegan'
#ok_cupid_df[ok_cupid_df['diet'] == 'unknown_diet']['essay0'].tolist()
ok_cupid_df['status'].value_counts()
single            55697
seeing someone     2064
available          1865
married             310
unknown              10
Name: status, dtype: int64
ok_cupid_df["status"].replace({'unknown': 'unknown_status'}, inplace=True)
ok_cupid_df['status'].value_counts()
single            55697
seeing someone     2064
available          1865
married             310
unknown_status       10
Name: status, dtype: int64
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].fillna('unknown_body_type')
ok_cupid_df['body_type'].value_counts()
average              14652
fit                  12711
athletic             11819
unknown_body_type     5296
thin                  4711
curvy                 3924
a little extra        2629
skinny                1777
full figured          1009
overweight             444
jacked                 421
used up                355
rather not say         198
Name: body_type, dtype: int64
ok_cupid_df['education'] = ok_cupid_df['education'].fillna('unknown_education')
ok_cupid_df['education'].value_counts()
graduated from college/university    23959
graduated from masters program        8961
unknown_education                     6628
working on college/university         5712
working on masters program            1683
graduated from two-year college       1531
graduated from high school            1428
graduated from ph.d program           1272
graduated from law school             1122
working on two-year college           1074
dropped out of college/university      995
working on ph.d program                983
college/university                     801
graduated from space camp              657
dropped out of space camp              523
graduated from med school              446
working on space camp                  445
working on law school                  269
two-year college                       222
working on med school                  212
dropped out of two-year college        191
dropped out of masters program         140
masters program                        136
dropped out of ph.d program            127
dropped out of high school             102
high school                             96
working on high school                  87
space camp                              58
ph.d program                            26
law school                              19
dropped out of law school               18
dropped out of med school               12
med school                              11
Name: education, dtype: int64
ok_cupid_df['job'] = ok_cupid_df['job'].fillna('unknown_job')
#ok_cupid_df['job'].value_counts()
ok_cupid_df['ethnicity'] = ok_cupid_df['ethnicity'].fillna('unknown_ethnicity')
#ok_cupid_df['ethnicity'].value_counts()
ok_cupid_df['offspring'] = ok_cupid_df['offspring'].fillna('unknown_offspring')
#ok_cupid_df['offspring'].value_counts()
ok_cupid_df['pets'] = ok_cupid_df['pets'].fillna('unknown_pets')
#ok_cupid_df['pets'].value_counts()
ok_cupid_df['religion'] = ok_cupid_df['religion'].fillna('unknown_religion')
#ok_cupid_df['religion'].value_counts()
ok_cupid_df['sign'] = ok_cupid_df['sign'].fillna('unknown_sign')
#ok_cupid_df['sign'].value_counts()
ok_cupid_df['smokes'] = ok_cupid_df['smokes'].fillna('unknown_smokes')
ok_cupid_df['smokes'].value_counts()
no                43896
unknown_smokes     5512
sometimes          3787
when drinking      3040
yes                2231
trying to quit     1480
Name: smokes, dtype: int64
ok_cupid_df['drinks'] = ok_cupid_df['drinks'].fillna('unknown_drinks')
ok_cupid_df['drinks'].value_counts()
socially          41780
rarely             5957
often              5164
not at all         3267
unknown_drinks     2985
very often          471
desperately         322
Name: drinks, dtype: int64
ok_cupid_df['speaks'] = ok_cupid_df['speaks'].fillna('unknown_speaks')
ok_cupid_df['speaks'].value_counts()
english                                                                                           21828
english (fluently)                                                                                 6628
english (fluently), spanish (poorly)                                                               2059
english (fluently), spanish (okay)                                                                 1917
english (fluently), spanish (fluently)                                                             1288
                                                                                                  ...  
english (fluently), french (poorly), polish (poorly), latin (poorly), italian (poorly)                1
english (fluently), hebrew (fluently), yiddish (fluently)                                             1
english (fluently), spanish (okay), catalan (poorly), italian (poorly)                                1
english (fluently), c++ (fluently), bengali (okay), french (poorly)                                   1
english (fluently), ancient greek (okay), spanish (fluently), french (poorly), hebrew (poorly)        1
Name: speaks, Length: 7648, dtype: int64
ok_cupid_df[ok_cupid_df['height'].isna()]
age status sex orientation body_type diet drinks drugs education ethnicity ... essay0 essay1 essay2 essay3 essay4 essay5 essay6 essay7 essay8 essay9
36428 32 single f straight unknown_body_type unknowndiet unknown_drinks unknown_drugs unknown_education other ... NaN NaN NaN NaN thomas bernhard, foucault, annie hall, taxi dr... NaN consciousness NaN i passionately hate liars! you know what my user name means and if you ar...
54002 25 single m straight unknown_body_type unknowndiet unknown_drinks never unknown_education hispanic / latin ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
58983 49 single m straight unknown_body_type unknowndiet unknown_drinks unknown_drugs unknown_education unknown_ethnicity ... great guy, lots of positive attributes*, but s... living it. quite a bit more than that - more ... lots, notably good, deep, excellent communicat... some positive stuff, but i'll hold my tongue o... lots. not especially up to listing 'em here a... 1. damn good friend, or better 2. managing to ... many things. maybe too much. not really up for... at the moment, i'd rather not even say or thin... i have a blog of much that's personal and priv... you've good reason to think we'd like make at ...

3 rows × 31 columns

ok_cupid_df.isna().sum()
age                0
status             0
sex                0
orientation        0
body_type          0
diet               0
drinks             0
drugs              0
education          0
ethnicity          0
height             3
income             0
job                0
last_online        0
location           0
offspring          0
pets               0
religion           0
sign               0
smokes             0
speaks             0
essay0          5488
essay1          7572
essay2          9638
essay3         11476
essay4         10537
essay5         10850
essay6         13771
essay7         12451
essay8         19225
essay9         12603
dtype: int64

Check for duplicalted rows:

ok_cupid_df[ok_cupid_df.duplicated()]
age status sex orientation body_type diet drinks drugs education ethnicity ... essay0 essay1 essay2 essay3 essay4 essay5 essay6 essay7 essay8 essay9

0 rows × 31 columns

Transform essays:

ok_cupid_df['essay0'] = ok_cupid_df['essay0'].fillna('')
ok_cupid_df['essay1'] = ok_cupid_df['essay1'].fillna('')
ok_cupid_df['essay2'] = ok_cupid_df['essay2'].fillna('')
ok_cupid_df['essay3'] = ok_cupid_df['essay3'].fillna('')
ok_cupid_df['essay4'] = ok_cupid_df['essay4'].fillna('')
ok_cupid_df['essay5'] = ok_cupid_df['essay5'].fillna('')
ok_cupid_df['essay6'] = ok_cupid_df['essay6'].fillna('')
ok_cupid_df['essay7'] = ok_cupid_df['essay7'].fillna('')
ok_cupid_df['essay8'] = ok_cupid_df['essay8'].fillna('')
ok_cupid_df['essay9'] = ok_cupid_df['essay9'].fillna('')
essay_columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
ok_cupid_df['essay'] = ok_cupid_df[essay_columns].apply(lambda row: ' '.join(row.values.astype(object)), axis = 1)
ok_cupid_df['essay'] = ok_cupid_df['essay'].str.replace('[^\w\s]','')
avg_words = pd.DataFrame()
avg_words['avg_words'] = ok_cupid_df['essay'].str.split().str.len()
avg_words.describe().T
count mean std min 25% 50% 75% max
avg_words 59946.0 353.896757 293.270595 0.0 158.0 296.0 477.0 10486.0
ok_cupid_df['essay_len'] = ok_cupid_df['essay'].str.len()
ok_cupid_df = ok_cupid_df.drop(columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9'])
ok_cupid_df.head(2)
age status sex orientation body_type diet drinks drugs education ethnicity ... last_online location offspring pets religion sign smokes speaks essay essay_len
0 22 single m straight a little extra strictly anything socially never working on college/university asian, white ... 2012-06-28-20-30 south san francisco, california doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini sometimes english about me i would love to think that i was som... 2389
1 35 single m straight average mostly other often sometimes working on space camp white ... 2012-06-29-21-41 oakland, california doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer no english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340

2 rows × 23 columns

ok_cupid_df.isna().sum()
age            0
status         0
sex            0
orientation    0
body_type      0
diet           0
drinks         0
drugs          0
education      0
ethnicity      0
height         3
income         0
job            0
last_online    0
location       0
offspring      0
pets           0
religion       0
sign           0
smokes         0
speaks         0
essay          0
essay_len      0
dtype: int64

Drop 3 rows with null values:

ok_cupid_df.dropna(inplace=True)
ok_cupid_df[ok_cupid_df['age']>100]
age status sex orientation body_type diet drinks drugs education ethnicity ... last_online location offspring pets religion sign smokes speaks essay essay_len
2512 110 single f straight unknown_body_type unknowndiet unknown_drinks unknown_drugs unknown_education unknown_ethnicity ... 2012-06-27-22-16 daly city, california unknown_offspring unknown_pets unknown_religion unknown_sign unknown_smokes english 9
25324 109 available m straight athletic mostly other unknown_drinks never working on masters program unknown_ethnicity ... 2012-06-30-18-18 san francisco, california might want kids unknown_pets other and somewhat serious about it aquarius but it doesn&rsquo;t matter when drinking english (okay) nothing 16

2 rows × 23 columns

#ok_cupid_df = ok_cupid_df.drop(ok_cupid_df.index[[, 25324]])
ok_cupid_df = ok_cupid_df.drop(index = [2512, 25324])
ok_cupid_df[ok_cupid_df['age']>100]
age status sex orientation body_type diet drinks drugs education ethnicity ... last_online location offspring pets religion sign smokes speaks essay essay_len

0 rows × 23 columns

Check duplicated columns:

ok_cupid_df.columns.duplicated()
array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])
ok_cupid_df[ok_cupid_df.duplicated()]
age status sex orientation body_type diet drinks drugs education ethnicity ... last_online location offspring pets religion sign smokes speaks essay essay_len

0 rows × 23 columns

ok_cupid_df['income'].value_counts()
-1          48437
 20000       2952
 100000      1621
 80000       1111
 30000       1048
 40000       1005
 50000        975
 60000        736
 70000        707
 150000       631
 1000000      521
 250000       149
 500000        48
Name: income, dtype: int64
ok_cupid_df.shape
(59941, 23)
#for column in ok_cupid_df.columns:
    
#    plt.figure()
#    plt.hist(ok_cupid_df[column], bins=25)
#    plt.title(f'Histogram of {column}')
#    plt.show()

Add binary column for gender (and drop existing sex column):

valid_income_df = ok_cupid_df[ok_cupid_df['income'] != -1]
male_df_income = valid_income_df[valid_income_df['sex'] == 'm']
female_df_income = valid_income_df[valid_income_df['sex'] == 'f']
ok_cupid_df2 = ok_cupid_df.copy()
ok_cupid_df2['body_type'] = ok_cupid_df2['body_type'].replace(['a little extra', 'full figured'],'curvy')
ok_cupid_df2['body_type'] = ok_cupid_df2['body_type'].replace(['thin'],'skinny')
ok_cupid_df2['body_type'] = ok_cupid_df2['body_type'].replace(['fit'],'athletic')
ok_cupid_df2['body_type'] = ok_cupid_df2['body_type'].replace(['unknown_body_type'],'rather not say')
# Add new binary column for gender
ok_cupid_df["male"] = np.where(ok_cupid_df["sex"]=="m", 1, 0)
ok_cupid_df.head(2)
age status sex orientation body_type diet drinks drugs education ethnicity ... location offspring pets religion sign smokes speaks essay essay_len male
0 22 single m straight a little extra strictly anything socially never working on college/university asian, white ... south san francisco, california doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini sometimes english about me i would love to think that i was som... 2389 1
1 35 single m straight average mostly other often sometimes working on space camp white ... oakland, california doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer no english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1

2 rows × 24 columns

# Drop the sex column
ok_cupid_df.drop(columns="sex", inplace=True)
ok_cupid_df.head(2)
age status orientation body_type diet drinks drugs education ethnicity height ... location offspring pets religion sign smokes speaks essay essay_len male
0 22 single straight a little extra strictly anything socially never working on college/university asian, white 75.0 ... south san francisco, california doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini sometimes english about me i would love to think that i was som... 2389 1
1 35 single straight average mostly other often sometimes working on space camp white 70.0 ... oakland, california doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer no english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1

2 rows × 23 columns

ok_cupid_df.reset_index(drop=True, inplace=True)
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay', 'essay_len'],
      dtype='object')
valid_income_df['education'] = valid_income_df['education'].replace({'working on space camp': 'space camp', 
                                                                     'working on college/university': 'college/university', 
                                                                     'graduated from college/university': 'college/university',
                                                                     'graduated from high school': 'high school',
                                                                     'dropped out of space camp': 'space camp',
                                                                     'graduated from space camp': 'space camp',
                                                                     'graduated from law school': 'law school',
                                                                     'graduated from masters program': 'masters program',
                                                                     'graduated from two-year college': 'two-year college',
                                                                     'working on med school': 'med school',
                                                                     'dropped out of high school': 'high school',
                                                                     'working on ph.d program': 'ph.d program',
                                                                     'graduated from ph.d program': 'ph.d program',
                                                                     'dropped out of college/university': 'college/university',
                                                                     'dropped out of two-year college': 'two-year college',
                                                                     'dropped out of med school': 'med school',
                                                                     'working on masters program': 'masters program',
                                                                     'working on two-year college': 'two-year college',
                                                                     'working on high school': 'high school',
                                                                     'graduated from med school': 'med school',
                                                                     'dropped out of masters program': 'masters program',
                                                                     'working on law school': 'law school',
                                                                     'dropped out of ph.d program': 'ph.d program',
                                                                     'dropped out of law school': 'law school'})
male_df_income = valid_income_df[valid_income_df['sex'] == 'm']
female_df_income = valid_income_df[valid_income_df['sex'] == 'f']
means_male = []
medians_male = []
for education in valid_income_df['education'].unique():
    means_male.append(male_df_income[male_df_income['education'] == education]['income'].mean())
    medians_male.append(male_df_income[male_df_income['education'] == education]['income'].median())
    
list_of_tuples_male = list(zip(male_df_income['education'].unique(), means_male, medians_male))

df_mean_incomes_male = pd.DataFrame(list_of_tuples_male, columns=['education', 'mean_income', 'median_income'])
means_female = []
medians_female = []
for education in valid_income_df['education'].unique():
    means_female.append(female_df_income[female_df_income['education'] == education]['income'].mean())
    medians_female.append(female_df_income[female_df_income['education'] == education]['income'].median())
    
list_of_tuples_female = list(zip(female_df_income['education'].unique(), means_female, medians_female))

df_mean_incomes_female = pd.DataFrame(list_of_tuples_female, columns=['education', 'mean_income', 'median_income'])
df_mean_incomes_male_mean = df_mean_incomes_male.sort_values(by = 'mean_income')
df_mean_incomes_male_median = df_mean_incomes_male.sort_values(by = 'median_income')

df_mean_incomes_female_mean = df_mean_incomes_female.sort_values(by = 'mean_income')
df_mean_incomes_female_median = df_mean_incomes_female.sort_values(by = 'median_income')
#df_mean_incomes_male_mean
educations = valid_income_df['education'].unique()

fig = go.Figure(data=[
    go.Bar(name='Male', 
           y=educations, 
           x=df_mean_incomes_male_median['median_income'], 
           text=df_mean_incomes_male_median['median_income'], orientation='h'),
    go.Bar(name='Female', 
           y=educations, 
           x=df_mean_incomes_female_median['median_income'],
           text=df_mean_incomes_female_median['median_income'], orientation='h')
])

fig.update_layout(barmode='group',
    title='Median Income for each education',
    #xaxis_tickfont_size=14,
    yaxis=dict(title='Educations',
               titlefont_size=16,
               tickfont_size=14),
    xaxis=dict(title='Median Income',
               titlefont_size=16,
               tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
fig = go.Figure()

fig.add_trace(go.Histogram(x=male_df_income['income'], name='Male'))
fig.add_trace(go.Histogram(x=female_df_income['income'], name='Female'))

fig.update_layout(barmode='overlay',
                  title='Distribution of Income for Males and Females',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Count',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Income',
                             titlefont_size=16,
                             tickfont_size=14))
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()
labels = ['Male', 'Female']
values = ok_cupid_df2['sex'].value_counts().values
fig = go.Figure(data=[go.Pie(labels=labels, 
                             values=values, 
                             hole=.5)])
fig.update_layout(title='Male vs Female')
fig.show()
Create new dataframes for Male, Female, and Orientations:
male_df = ok_cupid_df2[ok_cupid_df2['sex'] == 'm']
female_df = ok_cupid_df2[ok_cupid_df2['sex'] == 'f']
ok_cupid_df2['religion'] = ok_cupid_df2['religion'].replace({'agnosticism and very serious about it': 'agnosticism',
'agnosticism but not too serious about it' : 'agnosticism', 
'agnosticism and somewhat serious about it': 'agnosticism',
'agnosticism and laughing about it': 'agnosticism','agnosticism': 'agnostic'})
ok_cupid_df2['religion'] = ok_cupid_df2['religion'].replace({'atheism and laughing about it':'atheism',
'atheism and somewhat serious about it': 'atheism', 
'atheism but not too serious about it': 'atheism', 'atheism and very serious about it':'atheism', 'atheism': 'atheist'})
for religion in ok_cupid_df2['religion'].unique():
    if ((religion != 'atheist') and (religion != 'agnostic') and (religion != 'unknown_religion')):
        ok_cupid_df2['religion'] = ok_cupid_df2['religion'].replace({religion: 'religious'})
ok_cupid_df['religion_binary'] = ok_cupid_df['religion']

ok_cupid_df['religion_binary'] = ok_cupid_df['religion_binary'].replace({'agnosticism and very serious about it': 'agnostic',
'agnosticism but not too serious about it' : 'agnostic', 
'agnosticism and somewhat serious about it': 'agnostic',
'agnosticism and laughing about it': 'agnosticism','agnosticism': 'agnostic'})

ok_cupid_df['religion_binary'] = ok_cupid_df['religion_binary'].replace({'atheism and laughing about it':'atheism',
'atheism and somewhat serious about it': 'atheist', 
'atheism but not too serious about it': 'atheist', 'atheism and very serious about it':'atheist', 'atheism': 'atheist'})

for religion in ok_cupid_df['religion_binary'].unique():
    if ((religion != 'atheist') and (religion != 'agnostic') and (religion != 'unknown_religion')):
        ok_cupid_df['religion_binary'] = ok_cupid_df['religion_binary'].replace({religion: 'religious'})
ok_cupid_df2['religion'].value_counts()
religious           34820
unknown_religion    20222
agnostic             2724
atheist              2175
Name: religion, dtype: int64
ok_cupid_df['religion_binary'].value_counts()
religious           28492
unknown_religion    20222
agnostic             6316
atheist              4911
Name: religion_binary, dtype: int64
ok_cupid_df.head(2)
age status orientation body_type diet drinks drugs education ethnicity height ... offspring pets religion sign smokes speaks essay essay_len male religion_binary
0 22 single straight a little extra strictly anything socially never working on college/university asian, white 75.0 ... doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini sometimes english about me i would love to think that i was som... 2389 1 agnostic
1 35 single straight average mostly other often sometimes working on space camp white 70.0 ... doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer no english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1 agnostic

2 rows × 24 columns

ok_cupid_df['is_religious'] = np.where(ok_cupid_df['religion_binary'] == 'religious', 1, 0)
ok_cupid_df['is_agnostic'] = np.where(ok_cupid_df['religion_binary'] == 'agnostic', 1, 0)
ok_cupid_df['is_atheist'] = np.where(ok_cupid_df['religion_binary'] == 'atheist', 1, 0)
ok_cupid_df.head(2)
age status orientation body_type diet drinks drugs education ethnicity height ... sign smokes speaks essay essay_len male religion_binary is_religious is_agnostic is_atheist
0 22 single straight a little extra strictly anything socially never working on college/university asian, white 75.0 ... gemini sometimes english about me i would love to think that i was som... 2389 1 agnostic 0 1 0
1 35 single straight average mostly other often sometimes working on space camp white 70.0 ... cancer no english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1 agnostic 0 1 0

2 rows × 27 columns

straight_df = ok_cupid_df[ok_cupid_df['orientation'] == 'straight']
non_straight_df = ok_cupid_df[ok_cupid_df['orientation'].isin(['bisexual', 'gay'])]
non_straight_df.head(2)
age status orientation body_type diet drinks drugs education ethnicity height ... sign smokes speaks essay essay_len male religion_binary is_religious is_agnostic is_atheist
37 25 single bisexual fit mostly anything socially unknown_drugs working on college/university hispanic / latin, white 69.0 ... libra and it&rsquo;s fun to think about unknown_smokes english (fluently), spanish (poorly) lets go to a festival and dance all night runn... 3516 1 unknown_religion 0 0 0
44 29 single bisexual curvy anything socially sometimes graduated from masters program white 66.0 ... aquarius and it&rsquo;s fun to think about no english, spanish (poorly), portuguese (poorly) i am an east coast transplant looking for fun ... 2259 0 religious 1 0 0

2 rows × 27 columns

ok_cupid_df['is_straight'] = np.where(ok_cupid_df['orientation'] == 'straight', 1, 0)
ok_cupid_df['is_not_straight'] = np.where(ok_cupid_df['orientation'] != 'straight', 1, 0)
ok_cupid_df.head(2)
age status orientation body_type diet drinks drugs education ethnicity height ... speaks essay essay_len male religion_binary is_religious is_agnostic is_atheist is_straight is_not_straight
0 22 single straight a little extra strictly anything socially never working on college/university asian, white 75.0 ... english about me i would love to think that i was som... 2389 1 agnostic 0 1 0 1 0
1 35 single straight average mostly other often sometimes working on space camp white 70.0 ... english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1 agnostic 0 1 0 1 0

2 rows × 29 columns


# relgns =ok_cupid_df['religion_binary'].unique()

# fig = go.Figure([go.Bar(x=relgns, y=ok_cupid_df["religion_binary"].value_counts().values, color=ok_cupid_df['is_straight'])])
# fig.show()
religious_views = ['religious', 'unknown_religion', 'agnostic', 'atheist']
fig = go.Figure(data=[
    go.Bar(name='Straight', 
           x=religious_views, 
           y=straight_df['religion'].value_counts().values, 
           text=straight_df['religion'].value_counts().values),
    go.Bar(name='Non-straight', 
           x=religious_views, 
           y=non_straight_df['religion'].value_counts().values,
           text=non_straight_df['religion'].value_counts().values)
])

fig.update_layout(barmode='group',
    title='Religions',
    #xaxis_tickfont_size=14,
    yaxis=dict(title='Counts',
               titlefont_size=16,
               tickfont_size=14),
    xaxis=dict(title='Religions',
               titlefont_size=16,
               tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
ok_cupid_df = ok_cupid_df.drop(columns=['is_not_straight', 'religion_binary'])
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay', 'essay_len'],
      dtype='object')
ok_cupid_df2.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay', 'essay_len'],
      dtype='object')
fig = go.Figure()

fig.add_trace(go.Histogram(x=male_df['age'], name='Male'))
fig.add_trace(go.Histogram(x=female_df['age'], name='Female'))

fig.update_layout(barmode='overlay',
                  title='Distribution of Age for Males and Females',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Count',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Age',
                             titlefont_size=16,
                             tickfont_size=14))
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()
fig = go.Figure()

fig.add_trace(go.Histogram(x=male_df['height'], name='Male'))
fig.add_trace(go.Histogram(x=female_df['height'], name='Female'))

fig.update_layout(barmode='overlay',
                  title='Distribution of Heights for Males and Females',
                  #xaxis_tickfont_size=14,
                  yaxis=dict(title='Count',
                             titlefont_size=16,
                             tickfont_size=14),
                  xaxis=dict(title='Height',
                             titlefont_size=16,
                             tickfont_size=14))
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()
orientations = ['straight', 'bisexual', 'gay']
fig = go.Figure(data=[
    go.Bar(name='Male', 
           x=orientations, 
           y=male_df['orientation'].value_counts().values, 
           text=male_df['orientation'].value_counts().values),
    go.Bar(name='Female', 
           x=orientations, 
           y=female_df['orientation'].value_counts().values,
           text=female_df['orientation'].value_counts().values)
])

fig.update_layout(barmode='group',
    title='Orientations',
    #xaxis_tickfont_size=14,
    yaxis=dict(title='Counts',
               titlefont_size=16,
               tickfont_size=14),
    xaxis=dict(title='Orientation',
               titlefont_size=16,
               tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
orientations = ['average', 'average', 'curvy', 'skinny', 'rather not say', 'overweight', 'jacked', 'used up']
fig = go.Figure(data=[
    go.Bar(name='Male', 
           y=orientations, 
           x=male_df['body_type'].value_counts().values, 
           text=male_df['body_type'].value_counts().values, orientation='h'),
    go.Bar(name='Female', 
           y=orientations, 
           x=female_df['body_type'].value_counts().values,
           text=female_df['body_type'].value_counts().values, orientation='h')
])

fig.update_layout(barmode='group',
    title='Body Types for Male and Female',
    #xaxis_tickfont_size=14,
    yaxis=dict(title='Body Types',
               titlefont_size=16,
               tickfont_size=14),
    xaxis=dict(title='Counts',
               titlefont_size=16,
               tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()
orientations = ['single', 'available', 'seeing someone', 'married', 'unknown_status']
fig = go.Figure(data=[
    go.Bar(name='Male', 
           x=orientations, 
           y=male_df['status'].value_counts().values, 
           text=male_df['status'].value_counts().values),
    go.Bar(name='Female', 
           x=orientations, 
           y=female_df['status'].value_counts().values,
           text=female_df['status'].value_counts().values)
])

fig.update_layout(barmode='group',
    title='Status',
    #xaxis_tickfont_size=14,
    yaxis=dict(title='Counts',
               titlefont_size=16,
               tickfont_size=14),
    xaxis=dict(title='Status',
               titlefont_size=16,
               tickfont_size=14))
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

Encoding:

Orientation, status, job

from sklearn.preprocessing import OneHotEncoder

# Instantiate OneHotEncoder
orientation_ohe = OneHotEncoder()
status_ohe = OneHotEncoder()
job_ohe = OneHotEncoder()

# Fit the OneHotEncoder and transform
orientation = pd.DataFrame(ok_cupid_df['orientation'])
orientation_encoded = orientation_ohe.fit_transform(orientation)
display(orientation_encoded)

status = pd.DataFrame(ok_cupid_df['status'])
status_encoded = status_ohe.fit_transform(status)
display(status_encoded)

job = pd.DataFrame(ok_cupid_df['job'])
job_encoded = job_ohe.fit_transform(job)
display(job_encoded)
<59941x3 sparse matrix of type '<class 'numpy.float64'>'
	with 59941 stored elements in Compressed Sparse Row format>



<59941x5 sparse matrix of type '<class 'numpy.float64'>'
	with 59941 stored elements in Compressed Sparse Row format>



<59941x22 sparse matrix of type '<class 'numpy.float64'>'
	with 59941 stored elements in Compressed Sparse Row format>
# Put into a dataframe to get column names
encoded_df_orientation = pd.DataFrame(orientation_encoded.toarray().astype(int), columns=orientation_ohe.categories_[0], dtype=int)
encoded_df_orientation = encoded_df_orientation.drop(encoded_df_orientation.columns[0], axis=1)
display(encoded_df_orientation.head(2))

# Status
encoded_df_status = pd.DataFrame(status_encoded.toarray().astype(int), columns=status_ohe.categories_[0], dtype=int)
encoded_df_status = encoded_df_status.drop(columns='unknown_status')
display(encoded_df_status.head(2))

# Jobs
encoded_df_job = pd.DataFrame(job_encoded.toarray().astype(int), columns=job_ohe.categories_[0], dtype=int)
encoded_df_job = encoded_df_job.drop(columns='unknown_job')
display(encoded_df_job.head(2))
gay straight
0 0 1
1 0 1
available married seeing someone single
0 0 0 0 1
1 0 0 0 1
artistic / musical / writer banking / financial / real estate clerical / administrative computer / hardware / software construction / craftsmanship education / academia entertainment / media executive / management hospitality / travel law / legal services ... military other political / government rather not say retired sales / marketing / biz dev science / tech / engineering student transportation unemployed
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 21 columns

encoded_df_job.rename(columns= {'other': 'otherjob'}, inplace = True)
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_orientation], axis=1)
ok_cupid_df.drop(columns='orientation', inplace=True)
ok_cupid_df.head(2)
age status body_type diet drinks drugs education ethnicity height income ... speaks essay essay_len male is_religious is_agnostic is_atheist is_straight gay straight
0 22 single a little extra strictly anything socially never working on college/university asian, white 75.0 -1 ... english about me i would love to think that i was som... 2389 1 0 1 0 1 0 1
1 35 single average mostly other often sometimes working on space camp white 70.0 80000 ... english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1 0 1 0 1 0 1

2 rows × 28 columns

ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_status], axis=1)
ok_cupid_df.drop(columns='status', inplace=True)
ok_cupid_df.head(2)
age body_type diet drinks drugs education ethnicity height income job ... is_religious is_agnostic is_atheist is_straight gay straight available married seeing someone single
0 22 a little extra strictly anything socially never working on college/university asian, white 75.0 -1 transportation ... 0 1 0 1 0 1 0 0 0 1
1 35 average mostly other often sometimes working on space camp white 70.0 80000 hospitality / travel ... 0 1 0 1 0 1 0 0 0 1

2 rows × 31 columns

ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_job], axis=1)
ok_cupid_df.drop(columns='job', inplace=True)
ok_cupid_df.head(2)
age body_type diet drinks drugs education ethnicity height income last_online ... military otherjob political / government rather not say retired sales / marketing / biz dev science / tech / engineering student transportation unemployed
0 22 a little extra strictly anything socially never working on college/university asian, white 75.0 -1 2012-06-28-20-30 ... 0 0 0 0 0 0 0 0 1 0
1 35 average mostly other often sometimes working on space camp white 70.0 80000 2012-06-29-21-41 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 51 columns

Essays:

from sklearn.feature_extraction.text import CountVectorizer
# import the nltk stopwords
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Let's test it out
stemmer = nltk.stem.PorterStemmer()

ENGLISH_STOP_WORDS = stopwords.words('english')

def my_tokenizer(sentence):
    # remove punctuation and set to lower case
    # for punctuation_mark in string.punctuation:
    #     sentence = sentence.replace(punctuation_mark,'').lower()

    # split sentence into words
    listofwords = sentence.split(' ')
    listofstemmed_words = []
    
    # remove stopwords and any tokens that are just empty strings
    for word in listofwords:
        if (not word in ENGLISH_STOP_WORDS) and (word!=''):
            # Stem words
            stemmed_word = stemmer.stem(word)
            listofstemmed_words.append(stemmed_word)

    return listofstemmed_words
Requirement already satisfied: nltk in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (3.6.2)
Requirement already satisfied: regex in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (2021.4.4)
Requirement already satisfied: joblib in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (1.0.1)
Requirement already satisfied: click in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (7.1.2)
Requirement already satisfied: tqdm in /Users/puneetsran/opt/anaconda3/envs/plotly_bokeh/lib/python3.8/site-packages (from nltk) (4.60.0)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneetsran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
# 1. Instantiate 
# essay = CountVectorizer(stop_words="english", min_df=5, max_features=100, tokenizer = lambda x: [stemmer.stem(i) for i in x.split(' ')])
essay = CountVectorizer(min_df=5, max_features=100, tokenizer = my_tokenizer)

# 2. Fit 
essay.fit(ok_cupid_df["essay"])

# 3. Transform
essay_transformed = essay.transform(ok_cupid_df["essay"])
essay_transformed
<59941x100 sparse matrix of type '<class 'numpy.int64'>'
	with 2153638 stored elements in Compressed Sparse Row format>
essay_df = pd.DataFrame(columns=essay.get_feature_names(), data=essay_transformed.toarray())
#essay_df = essay0_df.drop(essay_df.columns[0], axis=1)
essay_df.head(2)
adventur also alway anyth around art back big book citi ... want watch way well work world would write year your
0 0 0 0 2 0 0 0 0 3 0 ... 4 0 3 0 3 0 2 0 1 0
1 1 0 1 1 1 0 0 1 0 0 ... 0 1 0 0 1 0 0 0 0 0

2 rows × 100 columns

# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, essay_df], axis=1)
ok_cupid_df.head(2)
age body_type diet drinks drugs education ethnicity height income last_online ... want watch way well work world would write year your
0 22 a little extra strictly anything socially never working on college/university asian, white 75.0 -1 2012-06-28-20-30 ... 4 0 3 0 3 0 2 0 1 0
1 35 average mostly other often sometimes working on space camp white 70.0 80000 2012-06-29-21-41 ... 0 1 0 0 1 0 0 0 0 0

2 rows × 151 columns

# 1. Instantiate 
ethnicity = CountVectorizer()

# 2. Fit 
ethnicity.fit(ok_cupid_df["ethnicity"])

# 3. Transform
ethnicity_transformed = ethnicity.transform(ok_cupid_df["ethnicity"])
ethnicity_transformed
<59941x14 sparse matrix of type '<class 'numpy.int64'>'
	with 78195 stored elements in Compressed Sparse Row format>
ethnicity_transformed.toarray()
array([[0, 1, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])
ethnicity_df = pd.DataFrame(columns=ethnicity.get_feature_names(), data=ethnicity_transformed.toarray())

unknown_ethnicity = ethnicity_df[ethnicity_df['unknown_ethnicity'] == 1]['unknown_ethnicity']
unknown_ethnicity = unknown_ethnicity.sum()
# Drop one column to prevent redundant information
ethnicity_df = ethnicity_df.drop(columns='unknown_ethnicity')

ethnicity_df = ethnicity_df.rename(columns={'american': 'native_american', 'eastern': 'middle_eastern', 'islander': 'pacific_islander', 'hispanic': 'hispanic_latin'})
ethnicity_df = ethnicity_df.drop(columns=['native', 'middle', 'pacific', 'latin'])
ethnicity_df.head(2)
native_american asian black middle_eastern hispanic_latin indian pacific_islander other white
0 0 1 0 0 0 0 0 0 1
1 0 0 0 0 0 0 0 0 1
ethnicity_df.rename(columns = {'other':'other_ethnicity'}, inplace = True)
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, ethnicity_df], axis=1)
ok_cupid_df.head(2)
age body_type diet drinks drugs education ethnicity height income last_online ... your native_american asian black middle_eastern hispanic_latin indian pacific_islander other_ethnicity white
0 22 a little extra strictly anything socially never working on college/university asian, white 75.0 -1 2012-06-28-20-30 ... 0 0 1 0 0 0 0 0 0 1
1 35 average mostly other often sometimes working on space camp white 70.0 80000 2012-06-29-21-41 ... 0 0 0 0 0 0 0 0 0 1

2 rows × 160 columns

ethnicity_df['native_american'].value_counts()[1]
1265
native_american = ethnicity_df[ethnicity_df['native_american'] == 1]['native_american']
native_american = native_american.sum()

asian = ethnicity_df[ethnicity_df['asian'] == 1]['asian']
asian = asian.sum()

black = ethnicity_df[ethnicity_df['black'] == 1]['black']
black = black.sum()

middle_eastern = ethnicity_df[ethnicity_df['middle_eastern'] == 1]['middle_eastern']
middle_eastern = middle_eastern.sum()

hispanic_latin = ethnicity_df[ethnicity_df['hispanic_latin'] == 1]['hispanic_latin']
hispanic_latin = hispanic_latin.sum()

indian = ethnicity_df[ethnicity_df['indian'] == 1]['indian']
indian = indian.sum()

pacific_islander = ethnicity_df[ethnicity_df['pacific_islander'] == 1]['pacific_islander']
pacific_islander = pacific_islander.sum()

other_ethnicity = ethnicity_df[ethnicity_df['other_ethnicity'] == 1]['other_ethnicity']
other_ethnicity = other_ethnicity.sum()

white = ethnicity_df[ethnicity_df['white'] == 1]['white']
white = white.sum()
# intialise data of lists.
sum_ethnicities = {'ethnicity':['native american', 'asian', 'black',
                                'middle eastern', 'hispanic/latin',
                                'indian', 'pacific islander',
                                'other_ethnicity', 'white', 'unknown_ethnicity'],
        'sum':[native_american, asian, black, middle_eastern,
               hispanic_latin, indian, pacific_islander, other_ethnicity, 
               white, unknown_ethnicity]}
  
# Create DataFrame
ethnicities_sum = pd.DataFrame(sum_ethnicities)
ethnicities_sum['sum'].unique()
array([ 1265,  8205,  3328,   950,  5356,  1449,  1473,  3566, 37882,
        5677])
#fig = px.bar(ethnicities_sum, x='ethnicity', y='sum')
#fig.show()

labels = ethnicities_sum['ethnicity'].unique()
values = ethnicities_sum['sum'].unique()
fig = go.Figure(data=[go.Pie(labels=labels, 
                             values=values, 
                             hole=.5)])
fig.update_layout(title='Ethinicities')
fig.show()
ok_cupid_df.drop(columns='ethnicity', inplace=True)

Body type

ok_cupid_df['body_type'].value_counts()
average              14652
fit                  12711
athletic             11818
unknown_body_type     5292
thin                  4711
curvy                 3924
a little extra        2629
skinny                1777
full figured          1009
overweight             444
jacked                 421
used up                355
rather not say         198
Name: body_type, dtype: int64
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['a little extra', 'full figured'],'curvy')
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['thin'],'skinny')
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['fit'],'athletic')
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['unknown_body_type'],'rather not say')
ok_cupid_df['body_type'].value_counts()
athletic          24529
average           14652
curvy              7562
skinny             6488
rather not say     5490
overweight          444
jacked              421
used up             355
Name: body_type, dtype: int64
body_type_mapping = {'overweight':0, 'curvy':1, 'average':2, 'used up':3, 'rather not say': 4, 'skinny': 5, 'athletic': 6, 'jacked': 7}
body_type_mapped_data = ok_cupid_df['body_type'].map(body_type_mapping)
#ok_cupid_df = ok_cupid_df.drop(columns=['body_type'])
ok_cupid_df.drop(columns='body_type', inplace=True)

ok_cupid_df = pd.concat([ok_cupid_df, body_type_mapped_data], axis=1)
ok_cupid_df.head(2)
age diet drinks drugs education height income last_online location offspring ... native_american asian black middle_eastern hispanic_latin indian pacific_islander other_ethnicity white body_type
0 22 strictly anything socially never working on college/university 75.0 -1 2012-06-28-20-30 south san francisco, california doesn't have kids, but might want them ... 0 1 0 0 0 0 0 0 1 1
1 35 mostly other often sometimes working on space camp 70.0 80000 2012-06-29-21-41 oakland, california doesn't have kids, but might want them ... 0 0 0 0 0 0 0 0 1 2

2 rows × 159 columns

Drinks

ok_cupid_df['drinks'].value_counts()
socially          41780
rarely             5957
often              5164
not at all         3267
unknown_drinks     2980
very often          471
desperately         322
Name: drinks, dtype: int64
drinks_mapping = {'desperately':6, 'very often':5, 'often':4, 'unknown_drinks':3, 'socially': 2, 'rarely': 1, 'not at all': 0}
drinks_mapped_data = ok_cupid_df['drinks'].map(drinks_mapping)
ok_cupid_df = ok_cupid_df.drop(columns=['drinks'])
ok_cupid_df = pd.concat([ok_cupid_df, drinks_mapped_data], axis=1)
ok_cupid_df.head(2)
age diet drugs education height income last_online location offspring pets ... asian black middle_eastern hispanic_latin indian pacific_islander other_ethnicity white body_type drinks
0 22 strictly anything never working on college/university 75.0 -1 2012-06-28-20-30 south san francisco, california doesn't have kids, but might want them likes dogs and likes cats ... 1 0 0 0 0 0 0 1 1 2
1 35 mostly other sometimes working on space camp 70.0 80000 2012-06-29-21-41 oakland, california doesn't have kids, but might want them likes dogs and likes cats ... 0 0 0 0 0 0 0 1 2 4

2 rows × 159 columns

Diet

ok_cupid_df['diet'].value_counts()
unknowndiet            24389
mostly anything        16585
anything                6183
strictly anything       5114
mostly vegetarian       3444
mostly other            1006
strictly vegetarian      875
vegetarian               667
strictly other           452
mostly vegan             338
other                    331
strictly vegan           228
vegan                    137
mostly kosher             86
mostly halal              48
strictly halal            18
strictly kosher           18
halal                     11
kosher                    11
Name: diet, dtype: int64
#ok_cupid_df["diet"] = ok_cupid_df["diet"].replace({'anything': 'strictly anything', 'vegetarian': 'strictly vegetarian', 'other': 'strictly other',
 #                            'vegan': 'strictly vegan', 'kosher': 'strictly kosher', 'halal': 'strictly halal'})
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()
unknowndiet           24389
mostlyanything        16585
anything               6183
strictlyanything       5114
mostlyvegetarian       3444
mostlyother            1006
strictlyvegetarian      875
vegetarian              667
strictlyother           452
mostlyvegan             338
other                   331
strictlyvegan           228
vegan                   137
mostlykosher             86
mostlyhalal              48
strictlykosher           18
strictlyhalal            18
kosher                   11
halal                    11
Name: diet, dtype: int64
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace('other', 'otherdiet')
#ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()
unknowndiet           24389
mostlyanything        16585
anything               6183
strictlyanything       5114
mostlyvegetarian       3444
mostlyotherdiet        1006
strictlyvegetarian      875
vegetarian              667
strictlyotherdiet       452
mostlyvegan             338
otherdiet               331
strictlyvegan           228
vegan                   137
mostlykosher             86
mostlyhalal              48
strictlykosher           18
strictlyhalal            18
kosher                   11
halal                    11
Name: diet, dtype: int64
diet = CountVectorizer()

diet_transformed = diet.fit_transform(ok_cupid_df["diet"])
diet_transformed
<59941x19 sparse matrix of type '<class 'numpy.int64'>'
	with 59941 stored elements in Compressed Sparse Row format>
diet_df = pd.DataFrame(columns=diet.get_feature_names(), data=diet_transformed.toarray())

# Drop one column to prevent redundant information
diet_df = diet_df.drop(columns=['unknowndiet'])
diet_df.head(2)
anything halal kosher mostlyanything mostlyhalal mostlykosher mostlyotherdiet mostlyvegan mostlyvegetarian otherdiet strictlyanything strictlyhalal strictlykosher strictlyotherdiet strictlyvegan strictlyvegetarian vegan vegetarian
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
diff_diets = diet_df.columns.tolist()

for d in diff_diets:
    if d.startswith('mostly'):
        diff_diets.remove(d)

for d in diff_diets:
    if d.startswith('strictly'):
        diff_diets.remove(d)
diets = diff_diets

diets_mostly = []
for diet in diets:
    diets_mostly.append('mostly'+diet)
    
diets_strictly = []
for diet in diets:
    diets_strictly.append('strictly'+diet)
for diet in diets:
    if diet in diet_df.columns:
        diet_df.loc[diet_df[diet] == 1, diet] = 2
        
for diet, diet_mostly in zip(diets, diets_mostly):
    if diet_mostly in diet_df.columns:
        diet_df.loc[diet_df[diet_mostly] == 1, diet] = 1
        diet_df = diet_df.drop(columns=diet_mostly)

for diet, diet_strictly in zip(diets, diets_strictly):
    if diet_strictly in diet_df.columns:
        diet_df.loc[diet_df[diet_strictly] == 1, diet] = 3
        diet_df = diet_df.drop(columns=diet_strictly)
diet_df.head(2)
anything halal kosher otherdiet vegan vegetarian
0 3 0 0 0 0 0
1 0 0 0 0 0 0
diet_df.rename(columns = {'diet':'otherdiet'}, inplace = True)
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, diet_df], axis=1)
ok_cupid_df.drop(columns='diet', inplace=True)
ok_cupid_df.head(2)
age drugs education height income last_online location offspring pets religion ... other_ethnicity white body_type drinks anything halal kosher otherdiet vegan vegetarian
0 22 never working on college/university 75.0 -1 2012-06-28-20-30 south san francisco, california doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it ... 0 1 1 2 3 0 0 0 0 0
1 35 sometimes working on space camp 70.0 80000 2012-06-29-21-41 oakland, california doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it ... 0 1 2 4 0 0 0 0 0 0

2 rows × 164 columns

Drugs

ok_cupid_df['drugs'].value_counts()
never            37722
unknown_drugs    14077
sometimes         7732
often              410
Name: drugs, dtype: int64
drugs_mapping = {'often':3, 'sometimes':2, 'unknown_drugs':1, 'never':0}
drugs_mapped_data = ok_cupid_df['drugs'].map(drugs_mapping)
drugs_mapped_data = drugs_mapped_data.astype(int)
ok_cupid_df = ok_cupid_df.drop(columns=['drugs'])
ok_cupid_df = pd.concat([ok_cupid_df, drugs_mapped_data], axis=1)
ok_cupid_df.head(2)
age education height income last_online location offspring pets religion sign ... white body_type drinks anything halal kosher otherdiet vegan vegetarian drugs
0 22 working on college/university 75.0 -1 2012-06-28-20-30 south san francisco, california doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini ... 1 1 2 3 0 0 0 0 0 0
1 35 working on space camp 70.0 80000 2012-06-29-21-41 oakland, california doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer ... 1 2 4 0 0 0 0 0 0 2

2 rows × 164 columns

Education:

ok_cupid_df['education'].value_counts()
graduated from college/university    23959
graduated from masters program        8961
unknown_education                     6624
working on college/university         5712
working on masters program            1682
graduated from two-year college       1531
graduated from high school            1428
graduated from ph.d program           1272
graduated from law school             1122
working on two-year college           1074
dropped out of college/university      995
working on ph.d program                983
college/university                     801
graduated from space camp              657
dropped out of space camp              523
graduated from med school              446
working on space camp                  445
working on law school                  269
two-year college                       222
working on med school                  212
dropped out of two-year college        191
dropped out of masters program         140
masters program                        136
dropped out of ph.d program            127
dropped out of high school             102
high school                             96
working on high school                  87
space camp                              58
ph.d program                            26
law school                              19
dropped out of law school               18
dropped out of med school               12
med school                              11
Name: education, dtype: int64
education_mapping = {'dropped out of space camp':0, 'working on space camp':1, 'graduated from space camp':2,
                     'space camp': 2,
                     'dropped out of high school':3, 'working on high school': 4, 'graduated from high school': 5,
                     'high school': 5,
                     'dropped out of two-year college': 6, 'working on two-year college': 7,
                     'two-year college': 8,
                     'graduated from two-year college': 8, 'dropped out of college/university': 9,
                     'college/university': 11,
                     'working on college/university': 10, 'graduated from college/university': 11, 'unknown_education': 12,
                     'dropped out of masters program': 13, 'working on masters program': 14,
                     'masters program': 15,
                     'graduated from masters program': 15, 'dropped out of law school': 16,
                     'working on law school': 17, 'graduated from law school': 18,
                     'law school': 18,
                     'dropped out of ph.d program': 16, 'working on ph.d program': 17,
                     'ph.d program': 18,
                     'graduated from ph.d program': 18, 'dropped out of med school': 16, 
                     'working on med school': 17, 'graduated from med school': 18, 'med school': 18}
education_mapped_data = ok_cupid_df['education'].map(education_mapping)
education_mapped_data = education_mapped_data.astype(int)
ok_cupid_df = ok_cupid_df.drop(columns=['education'])
ok_cupid_df = pd.concat([ok_cupid_df, education_mapped_data], axis=1)
ok_cupid_df.head(2)
age height income last_online location offspring pets religion sign smokes ... body_type drinks anything halal kosher otherdiet vegan vegetarian drugs education
0 22 75.0 -1 2012-06-28-20-30 south san francisco, california doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini sometimes ... 1 2 3 0 0 0 0 0 0 10
1 35 70.0 80000 2012-06-29-21-41 oakland, california doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer no ... 2 4 0 0 0 0 0 0 2 1

2 rows × 164 columns

Smokes:

ok_cupid_df['smokes'].value_counts()
no                43895
unknown_smokes     5509
sometimes          3787
when drinking      3039
yes                2231
trying to quit     1480
Name: smokes, dtype: int64
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder # This is used for multiple columns

# Instantiate the label encoder
le = LabelEncoder()

# Fit and transform the order priority column
le.fit_transform(ok_cupid_df['smokes'])
array([1, 0, 0, ..., 0, 2, 1])
le.classes_
array(['no', 'sometimes', 'trying to quit', 'unknown_smokes',
       'when drinking', 'yes'], dtype=object)
smokes_mapping = {'yes':4, 'sometimes':3, 'when drinking':3, 'unknown_smokes':2, 'trying to quit':1, 'no': 0}
smokes_mapped_data = ok_cupid_df['smokes'].map(smokes_mapping)
smokes_mapped_data = smokes_mapped_data.astype(int)
ok_cupid_df = ok_cupid_df.drop(columns=['smokes'])
ok_cupid_df = pd.concat([ok_cupid_df, smokes_mapped_data], axis=1)
ok_cupid_df.head(2)
age height income last_online location offspring pets religion sign speaks ... drinks anything halal kosher otherdiet vegan vegetarian drugs education smokes
0 22 75.0 -1 2012-06-28-20-30 south san francisco, california doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini english ... 2 3 0 0 0 0 0 0 10 3
1 35 70.0 80000 2012-06-29-21-41 oakland, california doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer english (fluently), spanish (poorly), french (... ... 4 0 0 0 0 0 0 2 1 0

2 rows × 164 columns

Location:

ok_cupid_df['location'].str.split(',', expand=True).head(2)
0 1 2
0 south san francisco california None
1 oakland california None
ok_cupid_df['location'].str.split(',', expand=True)[1].unique()
array([' california', ' colorado', ' new york', ' oregon', ' arizona',
       ' hawaii', ' montana', ' wisconsin', ' virginia', ' spain',
       ' nevada', ' illinois', ' vietnam', ' ireland', ' louisiana',
       ' michigan', ' texas', ' united kingdom', ' massachusetts',
       ' north carolina', ' idaho', ' mississippi', ' new jersey',
       ' florida', ' minnesota', ' georgia', ' utah', ' washington',
       ' west virginia', ' connecticut', ' tennessee', ' rhode island',
       ' district of columbia', ' british columbia', ' missouri',
       ' germany', ' pennsylvania', ' netherlands', ' switzerland',
       ' ohio'], dtype=object)
ok_cupid_df['city'] = ok_cupid_df['location'].str.split(',', expand=True)[0]
ok_cupid_df_copy_city = ok_cupid_df.copy()
city_data = {'city':ok_cupid_df_copy_city['city'].unique()} 
    
# Convert the dictionary into DataFrame 
cities_df = pd.DataFrame(city_data) 
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

longitude = []
latitude = []
def findGeocode(city):
    try:
        geolocator = Nominatim(user_agent="your_app_name")
        return geolocator.geocode(city)
    except GeocoderTimedOut:
        return findGeocode(city)	
for i in (cities_df["city"]):
    if findGeocode(i) != None:
        loc = findGeocode(i)
        latitude.append(loc.latitude)
        longitude.append(loc.longitude)
    
    else:
        latitude.append(np.nan)
        longitude.append(np.nan)
cities_df["longitude"] = longitude
cities_df["latitude"] = latitude
  
cities_df.head(2)
city longitude latitude
0 south san francisco -122.416866 37.653540
1 oakland -122.271356 37.804456
#cities_df.rename(columns = {'City':'city'}, inplace = True)
df_cities = ok_cupid_df_copy_city['city'].value_counts().rename_axis('city').reset_index(name='counts')
df_cities.head(2)
city counts
0 san francisco 31063
1 oakland 7214
df_cities = pd.merge(cities_df, df_cities, on='city', how='inner')
df_cities.head(2)
city longitude latitude counts
0 south san francisco -122.416866 37.653540 416
1 oakland -122.271356 37.804456 7214
fig = px.scatter_mapbox(df_cities, lat="latitude", lon="longitude", 
                        size='counts',
                        color='city',
                        color_continuous_scale=px.colors.cyclical.IceFire, 
                        #size_max=15, zoom=10,
                        mapbox_style="carto-positron"
                       )
fig.show()
ok_cupid_df['state'] = ok_cupid_df['location'].str.split(',', expand=True)[1]
ok_cupid_df['state'] = ok_cupid_df['state'].str.lstrip(' ')
ok_cupid_df['state'].unique()
array(['california', 'colorado', 'new york', 'oregon', 'arizona',
       'hawaii', 'montana', 'wisconsin', 'virginia', 'spain', 'nevada',
       'illinois', 'vietnam', 'ireland', 'louisiana', 'michigan', 'texas',
       'united kingdom', 'massachusetts', 'north carolina', 'idaho',
       'mississippi', 'new jersey', 'florida', 'minnesota', 'georgia',
       'utah', 'washington', 'west virginia', 'connecticut', 'tennessee',
       'rhode island', 'district of columbia', 'british columbia',
       'missouri', 'germany', 'pennsylvania', 'netherlands',
       'switzerland', 'ohio'], dtype=object)
ok_cupid_df['country'] = 'united states'
ok_cupid_df.loc[ok_cupid_df['state'] == 'spain', 'state'] = 'madrid'
ok_cupid_df.loc[ok_cupid_df['city'] == 'madrid', 'country'] = 'spain'

ok_cupid_df.loc[ok_cupid_df['state'] == 'vietnam', 'state'] = 'khanh hoa'
ok_cupid_df.loc[ok_cupid_df['city'] == 'nha trang', 'country'] = 'vietnam'

ok_cupid_df.loc[ok_cupid_df['state'] == 'ireland', 'state'] = 'munster'
ok_cupid_df.loc[ok_cupid_df['city'] == 'cork', 'country'] = 'ireland'

ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'state'] = 'scotland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'country'] = 'united kingdom'

ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'state'] = 'england'
ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'country'] = 'united kingdom'

ok_cupid_df.loc[ok_cupid_df['state'] == 'germany', 'state'] = 'hessen'
ok_cupid_df.loc[ok_cupid_df['city'] == 'kassel', 'country'] = 'germany'

ok_cupid_df.loc[ok_cupid_df['state'] == 'netherlands', 'state'] = 'north holland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'amsterdam', 'country'] = 'netherlands'

ok_cupid_df.loc[ok_cupid_df['state'] == 'switzerland', 'state'] = 'graubunden'
ok_cupid_df.loc[ok_cupid_df['city'] == 'bonaduz', 'country'] = 'switzerland'

ok_cupid_df.loc[ok_cupid_df['city'] == 'vancouver', 'country'] = 'canada'

ok_cupid_df = ok_cupid_df.drop(columns=['location'])
ok_cupid_df[ok_cupid_df['country'] == 'canada']
age height income last_online offspring pets religion sign speaks essay ... kosher otherdiet vegan vegetarian drugs education smokes city state country
42435 32 63.0 60000 2012-06-28-18-38 doesn't have kids unknown_pets other aquarius english (fluently), chinese (poorly), french (... im happiest when wearing sunglasses and flipfl... ... 0 0 0 0 0 1 0 vancouver british columbia canada

1 rows × 166 columns

ok_cupid_df['state'].unique()
array(['california', 'colorado', 'new york', 'oregon', 'arizona',
       'hawaii', 'montana', 'wisconsin', 'virginia', 'madrid', 'nevada',
       'illinois', 'khanh hoa', 'munster', 'louisiana', 'michigan',
       'texas', 'scotland', 'england', 'massachusetts', 'north carolina',
       'idaho', 'mississippi', 'new jersey', 'florida', 'minnesota',
       'georgia', 'utah', 'washington', 'west virginia', 'connecticut',
       'tennessee', 'rhode island', 'district of columbia',
       'british columbia', 'missouri', 'hessen', 'pennsylvania',
       'north holland', 'graubunden', 'ohio'], dtype=object)
ok_cupid_df['country'].unique()
array(['united states', 'spain', 'vietnam', 'ireland', 'united kingdom',
       'canada', 'germany', 'netherlands', 'switzerland'], dtype=object)
# Instantiate the OneHotEncoder
city_ohe = OneHotEncoder()
state_ohe = OneHotEncoder()
country_ohe = OneHotEncoder()

# Fit the OneHotEncoder to the subcategory column and transform
# Expects a 2D array
city = pd.DataFrame(ok_cupid_df['city'])
city_encoded = city_ohe.fit_transform(city)
display(city_encoded)

state = pd.DataFrame(ok_cupid_df['state'])
state_encoded = state_ohe.fit_transform(state)
display(state_encoded)

country = pd.DataFrame(ok_cupid_df['country'])
country_encoded = country_ohe.fit_transform(country)
display(country_encoded)
<59941x197 sparse matrix of type '<class 'numpy.float64'>'
	with 59941 stored elements in Compressed Sparse Row format>



<59941x41 sparse matrix of type '<class 'numpy.float64'>'
	with 59941 stored elements in Compressed Sparse Row format>



<59941x9 sparse matrix of type '<class 'numpy.float64'>'
	with 59941 stored elements in Compressed Sparse Row format>
# Put into a dataframe to get column names
encoded_df_city = pd.DataFrame(city_encoded.toarray().astype(int), columns=city_ohe.categories_[0], dtype=int)
encoded_df_city = encoded_df_city.drop(encoded_df_city.columns[0], axis=1)
display(encoded_df_city.head(2))

# Status
encoded_df_state = pd.DataFrame(state_encoded.toarray().astype(int), columns=state_ohe.categories_[0], dtype=int)
encoded_df_state = encoded_df_state.drop(encoded_df_state.columns[0], axis=1)
display(encoded_df_state.head(2))

# Jobs
encoded_df_country = pd.DataFrame(country_encoded.toarray().astype(int), columns=country_ohe.categories_[0], dtype=int)
encoded_df_country = encoded_df_country.drop(encoded_df_country.columns[0], axis=1)
display(encoded_df_country.head(2))
albany amsterdam arcadia asheville ashland astoria atherton atlanta austin bayshore ... vallejo vancouver walnut creek washington waterford west oakland westlake woodacre woodbridge woodside
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 196 columns

british columbia california colorado connecticut district of columbia england florida georgia graubunden hawaii ... pennsylvania rhode island scotland tennessee texas utah virginia washington west virginia wisconsin
0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 40 columns

germany ireland netherlands spain switzerland united kingdom united states vietnam
0 0 0 0 0 0 0 1 0
1 0 0 0 0 0 0 1 0
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_city], axis=1)
ok_cupid_df.drop(columns='city', inplace=True)

ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_state], axis=1)
ok_cupid_df.drop(columns='state', inplace=True)

ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_country], axis=1)
ok_cupid_df.drop(columns='country', inplace=True)

ok_cupid_df.head(2)
age height income last_online offspring pets religion sign speaks essay ... west virginia wisconsin germany ireland netherlands spain switzerland united kingdom united states vietnam
0 22 75.0 -1 2012-06-28-20-30 doesn't have kids, but might want them likes dogs and likes cats agnosticism and very serious about it gemini english about me i would love to think that i was som... ... 0 0 0 0 0 0 0 0 1 0
1 35 70.0 80000 2012-06-29-21-41 doesn't have kids, but might want them likes dogs and likes cats agnosticism but not too serious about it cancer english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... ... 0 0 0 0 0 0 0 0 1 0

2 rows × 407 columns

Offspring:

ok_cupid_df['offspring'].value_counts()
unknown_offspring                          35558
doesn't have kids                           7559
doesn't have kids, but might want them      3875
doesn't have kids, but wants them           3565
doesn't want kids                           2927
has kids                                    1883
has a kid                                   1881
doesn't have kids, and doesn't want any     1132
has kids, but doesn't want more              442
has a kid, but doesn't want more             275
has a kid, and might want more               231
wants kids                                   225
might want kids                              181
has kids, and might want more                115
has a kid, and wants more                     71
has kids, and wants more                      21
Name: offspring, dtype: int64
ok_cupid_df['offspring'] = ok_cupid_df['offspring'].replace({"doesn't have kids": "doesnt_have_kids",
                                                             "doesn't have kids, but might want them": "doesnt_have_kids might_want_kids",
                                                             "doesn't have kids, but wants them": "doesnt_have_kids wants_kids",
                                                             "doesn't want kids": "doesnt_want_kids",
                                                             "has kids": "has_kids",
                                                             "has a kid": "has_a_kid",
                                                             "doesn't have kids, and doesn't want any": "doesnt_have_kids doesnt_want_kids",
                                                             "has kids, but doesn't want more": "has_kids doesnt_want_kids",
                                                             "has a kid, but doesn't want more": "has_a_kid doesnt_want_kids",
                                                             "has a kid, and might want more": "has_a_kid might_want_kids",
                                                             "wants kids": "wants_kids",
                                                             "might want kids": "might_want_kids",
                                                             "has kids, and might want more": "has_kids might_want_kids",
                                                             "has a kid, and wants more": "has_a_kid wants_kids",
                                                             "has kids, and wants more": "has_kids wants_kids"})

ok_cupid_df['offspring'].value_counts()
unknown_offspring                    35558
doesnt_have_kids                      7559
doesnt_have_kids might_want_kids      3875
doesnt_have_kids wants_kids           3565
doesnt_want_kids                      2927
has_kids                              1883
has_a_kid                             1881
doesnt_have_kids doesnt_want_kids     1132
has_kids doesnt_want_kids              442
has_a_kid doesnt_want_kids             275
has_a_kid might_want_kids              231
wants_kids                             225
might_want_kids                        181
has_kids might_want_kids               115
has_a_kid wants_kids                    71
has_kids wants_kids                     21
Name: offspring, dtype: int64
# 1. Instantiate 
offspring = CountVectorizer()

# 2. Fit 
offspring.fit(ok_cupid_df["offspring"])

# 3. Transform
offspring_transformed = offspring.transform(ok_cupid_df["offspring"])
offspring_transformed
<59941x7 sparse matrix of type '<class 'numpy.int64'>'
	with 69668 stored elements in Compressed Sparse Row format>
offspring_df = pd.DataFrame(columns=offspring.get_feature_names(), data=offspring_transformed.toarray())
offspring_df_copy = offspring_df.copy()
# Drop one column to prevent redundant information
offspring_df = offspring_df.drop(columns='unknown_offspring')
offspring_df.head(2)
doesnt_have_kids doesnt_want_kids has_a_kid has_kids might_want_kids wants_kids
0 1 0 0 0 1 0
1 1 0 0 0 1 0
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, offspring_df], axis=1)
ok_cupid_df.drop(columns='offspring', inplace=True)
ok_cupid_df.head(2)
age height income last_online pets religion sign speaks essay essay_len ... switzerland united kingdom united states vietnam doesnt_have_kids doesnt_want_kids has_a_kid has_kids might_want_kids wants_kids
0 22 75.0 -1 2012-06-28-20-30 likes dogs and likes cats agnosticism and very serious about it gemini english about me i would love to think that i was som... 2389 ... 0 0 1 0 1 0 0 0 1 0
1 35 70.0 80000 2012-06-29-21-41 likes dogs and likes cats agnosticism but not too serious about it cancer english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 ... 0 0 1 0 1 0 0 0 1 0

2 rows × 412 columns

Pets:

ok_cupid_df['pets'].value_counts()
unknown_pets                       19917
likes dogs and likes cats          14814
likes dogs                          7224
likes dogs and has cats             4313
has dogs                            4134
has dogs and likes cats             2333
likes dogs and dislikes cats        2029
has dogs and has cats               1474
has cats                            1406
likes cats                          1062
has dogs and dislikes cats           552
dislikes dogs and likes cats         240
dislikes dogs and dislikes cats      196
dislikes cats                        122
dislikes dogs and has cats            81
dislikes dogs                         44
Name: pets, dtype: int64
ok_cupid_df['pets'] = ok_cupid_df['pets'].replace({'likes dogs and likes cats': 'likes_dogs likes_cats',
                                                   'likes dogs': 'likes_dogs',
                                                   'likes dogs and has cats': 'likes_dogs has_cats',
                                                   'has dogs': 'has_dogs',
                                                   'has dogs and likes cats': 'has_dogs likes_cats',
                                                   'likes dogs and dislikes cats': 'likes_dogs dislikes_cats',
                                                   'has dogs and has cats': 'has_dogs has_cats',
                                                   'has cats': 'has_cats',
                                                   'likes cats': 'likes_cats',
                                                   'has dogs and dislikes cats': 'has_dogs dislikes_cats',
                                                   'dislikes dogs and likes cats': 'dislikes_dogs likes_cats',
                                                   'dislikes dogs and dislikes cats': 'dislikes_dogs dislikes_cats',
                                                   'dislikes cats': 'dislikes_cats',
                                                   'dislikes dogs and has cats': 'dislikes_dogs has_cats',
                                                   'dislikes dogs': 'dislikes_dogs'})

ok_cupid_df['pets'].value_counts()
unknown_pets                   19917
likes_dogs likes_cats          14814
likes_dogs                      7224
likes_dogs has_cats             4313
has_dogs                        4134
has_dogs likes_cats             2333
likes_dogs dislikes_cats        2029
has_dogs has_cats               1474
has_cats                        1406
likes_cats                      1062
has_dogs dislikes_cats           552
dislikes_dogs likes_cats         240
dislikes_dogs dislikes_cats      196
dislikes_cats                    122
dislikes_dogs has_cats            81
dislikes_dogs                     44
Name: pets, dtype: int64
# 1. Instantiate 
pets = CountVectorizer()

# 2. Fit 
pets.fit(ok_cupid_df["pets"])

# 3. Transform
pets_transformed = pets.transform(ok_cupid_df["pets"])
pets_transformed
<59941x7 sparse matrix of type '<class 'numpy.int64'>'
	with 85973 stored elements in Compressed Sparse Row format>
pets_df = pd.DataFrame(columns=pets.get_feature_names(), data=pets_transformed.toarray())
pets_df_copy = pets_df.copy()
# Drop one column to prevent redundant information
pets_df = pets_df.drop(columns='unknown_pets')

pets_df.head(2)
dislikes_cats dislikes_dogs has_cats has_dogs likes_cats likes_dogs
0 0 0 0 0 1 1
1 0 0 0 0 1 1
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, pets_df], axis=1)
ok_cupid_df.drop(columns='pets', inplace=True)
ok_cupid_df.head(2)
age height income last_online religion sign speaks essay essay_len male ... has_a_kid has_kids might_want_kids wants_kids dislikes_cats dislikes_dogs has_cats has_dogs likes_cats likes_dogs
0 22 75.0 -1 2012-06-28-20-30 agnosticism and very serious about it gemini english about me i would love to think that i was som... 2389 1 ... 0 0 1 0 0 0 0 0 1 1
1 35 70.0 80000 2012-06-29-21-41 agnosticism but not too serious about it cancer english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1 ... 0 0 1 0 0 0 0 0 1 1

2 rows × 417 columns

Religion:

ok_cupid_df['religion'].value_counts()
unknown_religion                              20222
agnosticism                                    2724
other                                          2691
agnosticism but not too serious about it       2636
agnosticism and laughing about it              2496
catholicism but not too serious about it       2318
atheism                                        2175
other and laughing about it                    2119
atheism and laughing about it                  2074
christianity                                   1957
christianity but not too serious about it      1952
other but not too serious about it             1554
judaism but not too serious about it           1517
atheism but not too serious about it           1318
catholicism                                    1064
christianity and somewhat serious about it      927
atheism and somewhat serious about it           848
other and somewhat serious about it             845
catholicism and laughing about it               726
judaism and laughing about it                   681
buddhism but not too serious about it           650
agnosticism and somewhat serious about it       642
judaism                                         612
christianity and very serious about it          578
atheism and very serious about it               570
catholicism and somewhat serious about it       548
other and very serious about it                 533
buddhism and laughing about it                  466
buddhism                                        403
christianity and laughing about it              373
buddhism and somewhat serious about it          359
agnosticism and very serious about it           314
judaism and somewhat serious about it           266
hinduism but not too serious about it           227
hinduism                                        107
catholicism and very serious about it           102
buddhism and very serious about it               70
hinduism and somewhat serious about it           58
islam                                            48
hinduism and laughing about it                   44
islam but not too serious about it               40
islam and somewhat serious about it              22
judaism and very serious about it                22
islam and laughing about it                      16
hinduism and very serious about it               14
islam and very serious about it                  13
Name: religion, dtype: int64
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace(' ', '')
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace('other', 'otherreligion')
ok_cupid_df['religion'].value_counts()
unknown_religion                          20222
agnosticism                                2724
otherreligion                              2691
agnosticismbutnottooseriousaboutit         2636
agnosticismandlaughingaboutit              2496
catholicismbutnottooseriousaboutit         2318
atheism                                    2175
otherreligionandlaughingaboutit            2119
atheismandlaughingaboutit                  2074
christianity                               1957
christianitybutnottooseriousaboutit        1952
otherreligionbutnottooseriousaboutit       1554
judaismbutnottooseriousaboutit             1517
atheismbutnottooseriousaboutit             1318
catholicism                                1064
christianityandsomewhatseriousaboutit       927
atheismandsomewhatseriousaboutit            848
otherreligionandsomewhatseriousaboutit      845
catholicismandlaughingaboutit               726
judaismandlaughingaboutit                   681
buddhismbutnottooseriousaboutit             650
agnosticismandsomewhatseriousaboutit        642
judaism                                     612
christianityandveryseriousaboutit           578
atheismandveryseriousaboutit                570
catholicismandsomewhatseriousaboutit        548
otherreligionandveryseriousaboutit          533
buddhismandlaughingaboutit                  466
buddhism                                    403
christianityandlaughingaboutit              373
buddhismandsomewhatseriousaboutit           359
agnosticismandveryseriousaboutit            314
judaismandsomewhatseriousaboutit            266
hinduismbutnottooseriousaboutit             227
hinduism                                    107
catholicismandveryseriousaboutit            102
buddhismandveryseriousaboutit                70
hinduismandsomewhatseriousaboutit            58
islam                                        48
hinduismandlaughingaboutit                   44
islambutnottooseriousaboutit                 40
islamandsomewhatseriousaboutit               22
judaismandveryseriousaboutit                 22
islamandlaughingaboutit                      16
hinduismandveryseriousaboutit                14
islamandveryseriousaboutit                   13
Name: religion, dtype: int64
religion = CountVectorizer()

religion_transformed = religion.fit_transform(ok_cupid_df["religion"])
religion_transformed
<59941x46 sparse matrix of type '<class 'numpy.int64'>'
	with 59941 stored elements in Compressed Sparse Row format>
religion_df = pd.DataFrame(columns=religion.get_feature_names(), data=religion_transformed.toarray())

# Drop one column to prevent redundant information
religion_df = religion_df.drop(columns=['unknown_religion'])
religion_df.head(2)
agnosticism agnosticismandlaughingaboutit agnosticismandsomewhatseriousaboutit agnosticismandveryseriousaboutit agnosticismbutnottooseriousaboutit atheism atheismandlaughingaboutit atheismandsomewhatseriousaboutit atheismandveryseriousaboutit atheismbutnottooseriousaboutit ... judaism judaismandlaughingaboutit judaismandsomewhatseriousaboutit judaismandveryseriousaboutit judaismbutnottooseriousaboutit otherreligion otherreligionandlaughingaboutit otherreligionandsomewhatseriousaboutit otherreligionandveryseriousaboutit otherreligionbutnottooseriousaboutit
0 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 45 columns

rel = religion_df.columns.tolist()

for r in rel:
    if r.endswith('andveryseriousaboutit'):
        rel.remove(r)

for r in rel:
    if r.endswith('andlaughingaboutit'):
        rel.remove(r)    
        
for r in rel:
    if r.endswith('andsomewhatseriousaboutit'):
        rel.remove(r)
        
for r in rel:
    if r.endswith('butnottooseriousaboutit'):
        rel.remove(r)
religions = rel

religions_serious = []
for religion in religions:
    religions_serious.append(religion+'andveryseriousaboutit')
    
religions_laughing = []
for religion in religions:
    religions_laughing.append(religion+'andlaughingaboutit')
    
religions_somewhat = []
for religion in religions:
    religions_somewhat.append(religion+'andsomewhatseriousaboutit')
    
religions_not_serious = []
for religion in religions:
    religions_not_serious.append(religion+'butnottooseriousaboutit')
for religion in religions:
    if religion in religion_df.columns:
        religion_df.loc[religion_df[religion] == 1, religion] = 4
        
for religion, religion_serious in zip(religions, religions_serious):
    if religion_serious in religion_df.columns:
        religion_df.loc[religion_df[religion_serious] == 1, religion] = 4
        religion_df = religion_df.drop(columns=religion_serious)

for religion, religion_laughing in zip(religions, religions_laughing):
    if religion_laughing in religion_df.columns:
        religion_df.loc[religion_df[religion_laughing] == 1, religion] = 1
        religion_df = religion_df.drop(columns=religion_laughing)
        
for religion, religion_somewhat in zip(religions, religions_somewhat):
    if religion_somewhat in religion_df.columns:
        religion_df.loc[religion_df[religion_somewhat] == 1, religion] = 3
        religion_df = religion_df.drop(columns=religion_somewhat)
        
for religion, religion_not_serious in zip(religions, religions_not_serious):
    if religion_not_serious in religion_df.columns:
        religion_df.loc[religion_df[religion_not_serious] == 1, religion] = 2
        religion_df = religion_df.drop(columns=religion_not_serious)
religion_df.head(2)
agnosticism atheism buddhism catholicism christianity hinduism islam judaism otherreligion
0 4 0 0 0 0 0 0 0 0
1 2 0 0 0 0 0 0 0 0
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, religion_df], axis=1)
ok_cupid_df.drop(columns='religion', inplace=True)
ok_cupid_df.head(2)
age height income last_online sign speaks essay essay_len male is_religious ... likes_dogs agnosticism atheism buddhism catholicism christianity hinduism islam judaism otherreligion
0 22 75.0 -1 2012-06-28-20-30 gemini english about me i would love to think that i was som... 2389 1 0 ... 1 4 0 0 0 0 0 0 0 0
1 35 70.0 80000 2012-06-29-21-41 cancer english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1 0 ... 1 2 0 0 0 0 0 0 0 0

2 rows × 425 columns

Sign:

ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace('&', '').str.replace('rsquo;','')
ok_cupid_df['sign'].value_counts()
unknown_sign                              11053
gemini and its fun to think about          1782
scorpio and its fun to think about         1772
leo and its fun to think about             1692
libra and its fun to think about           1649
taurus and its fun to think about          1640
cancer and its fun to think about          1597
pisces and its fun to think about          1592
sagittarius and its fun to think about     1583
virgo and its fun to think about           1574
aries and its fun to think about           1573
aquarius and its fun to think about        1503
virgo but it doesnt matter                 1497
leo but it doesnt matter                   1457
cancer but it doesnt matter                1454
gemini but it doesnt matter                1453
taurus but it doesnt matter                1450
libra but it doesnt matter                 1408
aquarius but it doesnt matter              1407
capricorn and its fun to think about       1376
sagittarius but it doesnt matter           1375
aries but it doesnt matter                 1373
capricorn but it doesnt matter             1319
pisces but it doesnt matter                1300
scorpio but it doesnt matter               1264
leo                                        1159
libra                                      1098
cancer                                     1092
virgo                                      1029
scorpio                                    1020
gemini                                     1013
taurus                                     1001
aries                                       995
pisces                                      992
aquarius                                    954
sagittarius                                 937
capricorn                                   833
scorpio and it matters a lot                 78
leo and it matters a lot                     66
cancer and it matters a lot                  63
aquarius and it matters a lot                63
pisces and it matters a lot                  62
gemini and it matters a lot                  62
libra and it matters a lot                   52
taurus and it matters a lot                  49
aries and it matters a lot                   47
sagittarius and it matters a lot             47
capricorn and it matters a lot               45
virgo and it matters a lot                   41
Name: sign, dtype: int64
ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace(' ', '')
ok_cupid_df['sign'].value_counts()
unknown_sign                        11053
geminianditsfuntothinkabout          1782
scorpioanditsfuntothinkabout         1772
leoanditsfuntothinkabout             1692
libraanditsfuntothinkabout           1649
taurusanditsfuntothinkabout          1640
canceranditsfuntothinkabout          1597
piscesanditsfuntothinkabout          1592
sagittariusanditsfuntothinkabout     1583
virgoanditsfuntothinkabout           1574
ariesanditsfuntothinkabout           1573
aquariusanditsfuntothinkabout        1503
virgobutitdoesntmatter               1497
leobutitdoesntmatter                 1457
cancerbutitdoesntmatter              1454
geminibutitdoesntmatter              1453
taurusbutitdoesntmatter              1450
librabutitdoesntmatter               1408
aquariusbutitdoesntmatter            1407
capricornanditsfuntothinkabout       1376
sagittariusbutitdoesntmatter         1375
ariesbutitdoesntmatter               1373
capricornbutitdoesntmatter           1319
piscesbutitdoesntmatter              1300
scorpiobutitdoesntmatter             1264
leo                                  1159
libra                                1098
cancer                               1092
virgo                                1029
scorpio                              1020
gemini                               1013
taurus                               1001
aries                                 995
pisces                                992
aquarius                              954
sagittarius                           937
capricorn                             833
scorpioanditmattersalot                78
leoanditmattersalot                    66
canceranditmattersalot                 63
aquariusanditmattersalot               63
piscesanditmattersalot                 62
geminianditmattersalot                 62
libraanditmattersalot                  52
taurusanditmattersalot                 49
ariesanditmattersalot                  47
sagittariusanditmattersalot            47
capricornanditmattersalot              45
virgoanditmattersalot                  41
Name: sign, dtype: int64
sign = CountVectorizer()

sign_transformed = sign.fit_transform(ok_cupid_df["sign"])
sign_transformed
<59941x49 sparse matrix of type '<class 'numpy.int64'>'
	with 59941 stored elements in Compressed Sparse Row format>
sign_df = pd.DataFrame(columns=sign.get_feature_names(), data=sign_transformed.toarray())

# Drop one column to prevent redundant information
sign_df = sign_df.drop(columns=['unknown_sign'])
sign_df.head(2)
aquarius aquariusanditmattersalot aquariusanditsfuntothinkabout aquariusbutitdoesntmatter aries ariesanditmattersalot ariesanditsfuntothinkabout ariesbutitdoesntmatter cancer canceranditmattersalot ... scorpioanditsfuntothinkabout scorpiobutitdoesntmatter taurus taurusanditmattersalot taurusanditsfuntothinkabout taurusbutitdoesntmatter virgo virgoanditmattersalot virgoanditsfuntothinkabout virgobutitdoesntmatter
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 48 columns

sig = sign_df.columns.tolist()

for s in sig:
    if s.endswith('butitdoesntmatter'):
        sig.remove(s)

for s in sig:
    if s.endswith('anditmattersalot'):
        sig.remove(s)    
        
for s in sig:
    if s.endswith('anditsfuntothinkabout'):
        sig.remove(s)
signs = sig

signs_doesntmatter = []
for sign in signs:
    signs_doesntmatter.append(sign+'butitdoesntmatter')
    
signs_matters = []
for sign in signs:
    signs_matters.append(sign+'anditmattersalot')
    
signs_fun = []
for sign in signs:
    signs_fun.append(sign+'anditsfuntothinkabout')
for sign in signs:
    if sign in sign_df.columns:
        sign_df.loc[sign_df[sign] == 1, sign] = 3
        
for sign, sign_doesntmatter in zip(signs, signs_doesntmatter):
    if sign_doesntmatter in sign_df.columns:
        sign_df.loc[sign_df[sign_doesntmatter] == 1, sign] = 2
        sign_df = sign_df.drop(columns=sign_doesntmatter)

for sign, sign_matters in zip(signs, signs_matters):
    if sign_matters in sign_df.columns:
        sign_df.loc[sign_df[sign_matters] == 1, sign] = 3
        sign_df = sign_df.drop(columns=sign_matters)
        
for sign, sign_fun in zip(signs, signs_fun):
    if sign_fun in sign_df.columns:
        sign_df.loc[sign_df[sign_fun] == 1, sign] = 1
        sign_df = sign_df.drop(columns=sign_fun)
sign_df.head(2)
aquarius aries cancer capricorn gemini leo libra pisces sagittarius scorpio taurus virgo
0 0 0 0 0 3 0 0 0 0 0 0 0
1 0 0 3 0 0 0 0 0 0 0 0 0
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, sign_df], axis=1)
ok_cupid_df.drop(columns='sign', inplace=True)
ok_cupid_df.head(2)
age height income last_online speaks essay essay_len male is_religious is_agnostic ... cancer capricorn gemini leo libra pisces sagittarius scorpio taurus virgo
0 22 75.0 -1 2012-06-28-20-30 english about me i would love to think that i was som... 2389 1 0 1 ... 0 0 3 0 0 0 0 0 0 0
1 35 70.0 80000 2012-06-29-21-41 english (fluently), spanish (poorly), french (... i am a chef this is what that means 1 i am a w... 1340 1 0 1 ... 3 0 0 0 0 0 0 0 0 0

2 rows × 436 columns

Speaks (languages):

ok_cupid_df['speaks'] = ok_cupid_df['speaks'].str.replace(' ', '').str.replace('(', '_').str.replace(')', '').str.replace(',', ' ').str.replace('_', '')
ok_cupid_df['speaks'].value_counts()
english                                                                      21826
englishfluently                                                               6627
englishfluently spanishpoorly                                                 2059
englishfluently spanishokay                                                   1917
englishfluently spanishfluently                                               1288
                                                                             ...  
englishfluently norwegianfluently swedishokay germanokay                         1
english spanish portuguese thai                                                  1
englishfluently frenchokay bulgarianpoorly chechenpoorly chinesepoorly           1
englishfluently chinesepoorly vietnamesepoorly japanesepoorly spanishokay        1
englishfluently chinesefluently japanesepoorly c++fluently otherfluently         1
Name: speaks, Length: 7648, dtype: int64
speaks = CountVectorizer()

speaks_transformed = speaks.fit_transform(ok_cupid_df["speaks"])
speaks_transformed
<59941x302 sparse matrix of type '<class 'numpy.int64'>'
	with 110527 stored elements in Compressed Sparse Row format>
speaks_df = pd.DataFrame(columns=speaks.get_feature_names(), data=speaks_transformed.toarray())

# Drop one column to prevent redundant information
speaks_df = speaks_df.drop(columns=['unknownspeaks', 'poorly', 'fluently', 'okay', 'lisp', 'lispokay', 'lisppoorly', 'lispfluently'])
speaks_df.head(2)
afrikaans afrikaansfluently afrikaansokay afrikaanspoorly albanian albanianfluently albanianokay albanianpoorly ancientgreek ancientgreekfluently ... vietnameseokay vietnamesepoorly welsh welshfluently welshokay welshpoorly yiddish yiddishfluently yiddishokay yiddishpoorly
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 294 columns

speaks_df = speaks_df.rename(columns={'armenianfluently': 'armenian', 'slovenianfluently': 'slovenian', 'sardinianfluently': 'sardinian'})
lang = speaks_df.columns.tolist()

for l in lang:
    if l.endswith('fluently'):
        lang.remove(l)

for l in lang:
    if l.endswith('poorly'):
        lang.remove(l)    
        
for l in lang:
    if l.endswith('okay'):
        lang.remove(l)
languages = lang

languages_fluently = []
for language in languages:
    languages_fluently.append(language+'fluently')
    
languages_okay = []
for language in languages:
    languages_okay.append(language+'okay')
    
languages_poorly = []
for language in languages:
    languages_poorly.append(language+'poorly')
for language in languages:
    if language in speaks_df.columns:
        speaks_df.loc[speaks_df[language] == 1, language] = 3
        
for language, language_fluent in zip(languages, languages_fluently):
    if language_fluent in speaks_df.columns:
        speaks_df.loc[speaks_df[language_fluent] == 1, language] = 3
        speaks_df = speaks_df.drop(columns=language_fluent)

for language, language_okay in zip(languages, languages_okay):
    if language_okay in speaks_df.columns:
        speaks_df.loc[speaks_df[language_okay] == 1, language] = 2
        #speaks_df.loc[speaks_df['slovenian'] == 1, 'slovenian'] = 2
        speaks_df = speaks_df.drop(columns=language_okay)
        
for language, language_poorly in zip(languages, languages_poorly):
    if language_poorly in speaks_df.columns:
        speaks_df.loc[speaks_df[language_poorly] == 1, language] = 1
        speaks_df = speaks_df.drop(columns=language_poorly)
speaks_df.head(2)
afrikaans albanian ancientgreek arabic armenian basque belarusan bengali breton bulgarian ... tagalog tamil thai tibetan turkish ukrainian urdu vietnamese welsh yiddish
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 75 columns

# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, speaks_df], axis=1)
ok_cupid_df.drop(columns='speaks', inplace=True)
ok_cupid_df.head(2)
age height income last_online essay essay_len male is_religious is_agnostic is_atheist ... tagalog tamil thai tibetan turkish ukrainian urdu vietnamese welsh yiddish
0 22 75.0 -1 2012-06-28-20-30 about me i would love to think that i was som... 2389 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
1 35 70.0 80000 2012-06-29-21-41 i am a chef this is what that means 1 i am a w... 1340 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 510 columns

Drop income:

(too many null values)

ok_cupid_df.drop(columns='income', inplace=True)

Last online:

ok_cupid_df['last_online']
0        2012-06-28-20-30
1        2012-06-29-21-41
2        2012-06-27-09-10
3        2012-06-28-14-22
4        2012-06-27-21-26
               ...       
59936    2012-06-12-21-47
59937    2012-06-29-11-01
59938    2012-06-27-23-37
59939    2012-06-23-13-01
59940    2012-06-29-00-42
Name: last_online, Length: 59941, dtype: object
ok_cupid_df['last_online_year'] = ok_cupid_df['last_online'].str.split("-", expand=True)[0].astype('int')
ok_cupid_df['last_online_month'] = ok_cupid_df['last_online'].str.split("-", expand=True)[1].astype('int')
last_online_datetime = (ok_cupid_df['last_online'].str.split("-", expand=True)[0]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[1]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[2]).astype('datetime64')
ok_cupid_df['last_online_weekday'] = last_online_datetime.dt.weekday
ok_cupid_df['last_online_weekday'] = np.where(ok_cupid_df['last_online_weekday'] < 5, 1, 0)
ok_cupid_df['last_online_weekday']
0        1
1        1
2        1
3        1
4        1
        ..
59936    1
59937    1
59938    1
59939    0
59940    1
Name: last_online_weekday, Length: 59941, dtype: int64
#ok_cupid_df['last_online_year'] = ok_cupid_df['last_online'].str.split("-", expand=True)[0]
ok_cupid_df['last_online'].str.split("-", expand=True)[0]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[1]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[2]
0        2012-06-28
1        2012-06-29
2        2012-06-27
3        2012-06-28
4        2012-06-27
            ...    
59936    2012-06-12
59937    2012-06-29
59938    2012-06-27
59939    2012-06-23
59940    2012-06-29
Length: 59941, dtype: object
ok_cupid_df.drop(columns=['last_online', 'is_agnostic'], inplace=True)
ok_cupid_df.head(2)
age height essay essay_len male is_religious is_atheist is_straight gay straight ... tibetan turkish ukrainian urdu vietnamese welsh yiddish last_online_year last_online_month last_online_weekday
0 22 75.0 about me i would love to think that i was som... 2389 1 0 0 1 0 1 ... 0 0 0 0 0 0 0 2012 6 1
1 35 70.0 i am a chef this is what that means 1 i am a w... 1340 1 0 0 1 0 1 ... 0 0 0 0 0 0 0 2012 6 1

2 rows × 510 columns

ok_cupid_df = ok_cupid_df.drop(columns=['essay'])
ok_cupid_df.head(2)
age height essay_len male is_religious is_atheist is_straight gay straight available ... tibetan turkish ukrainian urdu vietnamese welsh yiddish last_online_year last_online_month last_online_weekday
0 22 75.0 2389 1 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 2012 6 1
1 35 70.0 1340 1 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 2012 6 1

2 rows × 509 columns

ok_cupid_df.isna().sum().sum()
0
ok_cupid_df.to_csv (r'data/okcupid_profiles_clean.csv', index = False, header=True)

Written on March 3, 2014