Click here to Skip to main content
15,887,746 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
See more:
I'm trying to shuffle my indices using the np.random.shuffle() method, but I keep getting an error that I don't understand. I'd really appreciate it if someone could help me puzzle this out. Thank you!


Here is my code:

#Goal: Preprocess the Data to Predict Excessive Employee absence

#Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


raw_csv_data= pd.read_csv('Absenteeism-data.csv')
print(raw_csv_data)

df= raw_csv_data.copy()
print(display(df))

pd.options.display.max_columns=None
pd.options.display.max_rows=None
print(display(df))

print(df.info())

df=df.drop(['ID'], axis=1)


print(display(df.head()))


#Our goal is to see who is more likely to be absent. Let's define
#our targets from our dependent variable, Absenteeism Time in Hours
print(df['Absenteeism Time in Hours'])
print(df['Absenteeism Time in Hours'].median())

targets= np.where(df['Absenteeism Time in Hours']>df['Absenteeism
Time in Hours'].median(),1,0)

print(targets)

df['Excessive Absenteeism']= targets

print(df.head())


#Let's Separate the Day and Month Values to see if there is
correlation
#between Day of week/month with absence
print(type(df['Date'][0]))

df['Date']= pd.to_datetime(df['Date'], format='%d/%m/%Y')

print(df['Date'])
print(type(df['Date'][0]))

#Extracting the Month Value
print(df['Date'][0].month)

list_months=[]
print(list_months)

print(df.shape)

for i in range(df.shape[0]):
list_months.append(df['Date'][i].month)

print(list_months)

print(len(list_months))

#Let's Create a Month Value Column for df
df['Month Value']= list_months

print(df.head())

#Now let's extract the day of the week from date
df['Date'][699].weekday()

def date_to_weekday(date_value):
    return date_value.weekday()

df['Day of the Week']= df['Date'].apply(date_to_weekday)

print(df.head())

df= df.drop(['Date'], axis=1)

print(df.columns.values)

reordered_columns= ['Reason for Absence', 'Month Value','Day of the
Week','Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets',
'Absenteeism Time in Hours', 'Excessive Absenteeism']

df=df[reordered_columns]
print(df.head())

#First Checkpoint
df_date_mod= df.copy()

print(df_date_mod)


#Let's Standardize our inputs, ignoring the Reasons and Education
Columns
#Because they are labelled by a separate categorical criteria, not
numerically
print(df_date_mod.columns.values)

unscaled_inputs= df_date_mod.loc[:, ['Month Value','Day of the
Week','Transportation Expense','Distance to Work','Age','Daily Work
Load Average','Body Mass Index','Children','Pets','Absenteeism Time
in Hours']]

print(display(unscaled_inputs))

absenteeism_scaler= StandardScaler()

absenteeism_scaler.fit(unscaled_inputs)

scaled_inputs= absenteeism_scaler.transform(unscaled_inputs)

print(display(scaled_inputs))

print(scaled_inputs.shape)

scaled_inputs= pd.DataFrame(scaled_inputs, columns=['Month
Value','Day of the Week','Transportation Expense','Distance to
Work','Age','Daily Work Load Average','Body Mass
Index','Children','Pets','Absenteeism Time in Hours'])
print(display(scaled_inputs))

df_date_mod= df_date_mod.drop(['Month Value','Day of the
Week','Transportation Expense','Distance to Work','Age','Daily Work
Load Average','Body Mass Index','Children','Pets','Absenteeism Time
in Hours'], axis=1)
print(display(df_date_mod))

df_date_mod=pd.concat([df_date_mod,scaled_inputs], axis=1)
print(display(df_date_mod))

df_date_mod= df_date_mod[reordered_columns]
print(display(df_date_mod.head()))

#Checkpoint
df_date_scale_mod= df_date_mod.copy()
print(display(df_date_scale_mod.head()))

#Let's Analyze the Reason for Absence Category
print(df_date_scale_mod['Reason for Absence'])

print(df_date_scale_mod['Reason for Absence'].min())
print(df_date_scale_mod['Reason for Absence'].max())

print(df_date_scale_mod['Reason for Absence'].unique())

print(len(df_date_scale_mod['Reason for Absence'].unique()))

print(sorted(df['Reason for Absence'].unique()))

reason_columns= pd.get_dummies(df['Reason for Absence'])
print(reason_columns)

reason_columns['check']= reason_columns.sum(axis=1)
print(reason_columns)

print(reason_columns['check'].sum(axis=0))

print(reason_columns['check'].unique())

reason_columns=reason_columns.drop(['check'], axis=1)
print(reason_columns)

reason_columns=pd.get_dummies(df_date_scale_mod['Reason for
Absence'], drop_first=True)
print(reason_columns)
#%%
print(df_date_scale_mod.columns.values)

print(reason_columns.columns.values)

df_date_scale_mod= df_date_scale_mod.drop(['Reason for Absence'],
axis=1)
print(df_date_scale_mod)

reason_type_1= reason_columns.loc[:, 1:14].max(axis=1)
reason_type_2= reason_columns.loc[:, 15:17].max(axis=1)
reason_type_3= reason_columns.loc[:, 18:21].max(axis=1)
reason_type_4= reason_columns.loc[:, 22:].max(axis=1)

print(reason_type_1)
print(reason_type_2)
print(reason_type_3)
print(reason_type_4)

print(df_date_scale_mod.head())

df_date_scale_mod= pd.concat([df_date_scale_mod,
reason_type_1,reason_type_2, reason_type_3, reason_type_4], axis=1)
print(df_date_scale_mod.head())

print(df_date_scale_mod.columns.values)

column_names= ['Month Value','Day of the Week','Transportation
Expense',
'Distance to Work','Age','Daily Work Load Average','Body Mass
Index',
'Education','Children','Pets','Absenteeism Time in Hours',
'Excessive Absenteeism', 'Reason_1', 'Reason_2', 'Reason_3',
'Reason_4']

df_date_scale_mod.columns= column_names
print(df_date_scale_mod.head())

column_names_reordered= ['Reason_1', 'Reason_2', 'Reason_3',
'Reason_4','Month Value','Day of the Week','Transportation Expense',
'Distance to Work','Age','Daily Work Load Average','Body Mass
Index',
'Education','Children','Pets','Absenteeism Time in Hours',
'Excessive Absenteeism']

df_date_scale_mod=df_date_scale_mod[column_names_reordered]
print(display(df_date_scale_mod.head()))

#Checkpoint
df_date_scale_mod_reas= df_date_scale_mod.copy()
print(df_date_scale_mod_reas.head())

#Let's Look at the Education column now
print(df_date_scale_mod_reas['Education'].unique())
#This shows us that education is rated from 1-4 based on level
#of completion

print(df_date_scale_mod_reas['Education'].value_counts())
#The overwhelming majority of workers are highschool educated, while
the rest have higher degrees

#We'll create our dummy variables as highschool and higher education
df_date_scale_mod_reas['Education']=
df_date_scale_mod_reas['Education'].map({1:0, 2:1, 3:1, 4:1})

print(df_date_scale_mod_reas['Education'].unique())

print(df_date_scale_mod_reas['Education'].value_counts())

#Checkpoint
df_preprocessed= df_date_scale_mod_reas.copy()
print(display(df_preprocessed.head()))


#Split Inputs from targets
scaled_inputs_all= df_preprocessed.loc[:,'Reason_1':'Absenteeism
Time in Hours']
print(display(scaled_inputs_all.head()))
print(scaled_inputs_all.shape)

targets_all= df_preprocessed.loc[:,'Excessive Absenteeism']
print(display(targets_all.head()))
print(targets_all.shape)

#Shuffle Inputs and targets
shuffled_indices= np.arange(scaled_inputs_all.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_inputs= scaled_inputs_all[shuffled_indices]
shuffled_targets= targets_all[shuffled_indices]


Here is the error:

KeyError                                  Traceback (most recent call last)
 in 
      1 shuffled_indices= np.arange(scaled_inputs_all.shape[0])
      2 np.random.shuffle(shuffled_indices)
----> 3 shuffled_inputs= scaled_inputs_all[shuffled_indices]
      4 shuffled_targets= targets_all[shuffled_indices]

~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2932                 key = list(key)
   2933             indexer = self.loc._convert_to_indexer(key, axis=1,
-> 2934                                                    raise_missing=True)
   2935 
   2936         # take() does not accept boolean indexers

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
   1352                 kwargs = {'raise_missing': True if is_setter else
   1353                           raise_missing}
-> 1354                 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
   1355         else:
   1356             try:

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1159         self._validate_read_indexer(keyarr, indexer,
   1160                                     o._get_axis_number(axis),
-> 1161                                     raise_missing=raise_missing)
   1162         return keyarr, indexer
   1163 

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1244                 raise KeyError(
   1245                     u"None of [{key}] are in the [{axis}]".format(
-> 1246                         key=key, axis=self.obj._get_axis_name(axis)))
   1247 
   1248             # We (temporarily) allow for some missing keys with .loc, except in

KeyError: "None of [Int64Index([560, 320, 405, 141, 154, 370, 656,  26, 444, 307,\n            ...\n            429, 542, 676, 588, 315, 284, 293, 607, 197, 250],\n           dtype='int64', length=700)] are in the [columns]"


What I have tried:

I've tried to use the delimiter=',' and delim_whitespace=0 (two solutions that i didn't understand anyway) when I made my raw_csv_data variable at the beginning, as I saw that as the solution of another problem, but it kept throwing the same error
Posted
Updated 13-Apr-19 6:23am
v2

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900