7. Prediction¶
7.1. Node Attribute¶
Predict node attributes based on centrality properties.
First, load the node attributes into a DataFrame so that the scikit-learn package can be used. Centrality metrics are also calculated and added as columns.
# Your Code Here
# Build one row per node of G: two node attributes plus four structural
# metrics. Every metric is a dict keyed by node, so wrapping it in a Series
# lets pandas align the values on the node index.
df = pd.DataFrame(index=G.nodes()).assign(
    dept=pd.Series(nx.get_node_attributes(G, 'Department')),
    mgmtsalary=pd.Series(nx.get_node_attributes(G, 'ManagementSalary')),
    deg=pd.Series(nx.degree_centrality(G)),
    btw=pd.Series(nx.betweenness_centrality(G)),
    close=pd.Series(nx.closeness_centrality(G)),
    cluster=pd.Series(nx.clustering(G)),
)

# Training rows: nodes whose 'mgmtsalary' label is present.
dftrain = df.loc[df['mgmtsalary'].notna()]
X = dftrain.loc[:, ['dept', 'deg', 'btw', 'close', 'cluster']]
y = dftrain.loc[:, 'mgmtsalary']

# Rows to predict: nodes whose 'mgmtsalary' label is missing.
dftest = df.loc[df['mgmtsalary'].isna()]
test = dftest.loc[:, ['dept', 'deg', 'btw', 'close', 'cluster']]
Then the labelled data is split into training and validation sets and fitted with a logistic regression inside a grid search. An AUC score of 0.94 is obtained.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
# Hold out part of the labelled rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Normalisation: the scaler is fitted on the training split only and then
# applied unchanged to the training, held-out and unlabelled feature sets.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
test = scaler.transform(test)

# Candidate regularisation values for the search.
grid_values = {'C': [100, 110, 120, 122, 125, 140, 150, 200, 250, 300, 320, 350, 400, 420]}
clf = LogisticRegression()

# 3-fold cross-validated search over C, ranked by ROC AUC.
grid_clf_auc = GridSearchCV(estimator=clf, param_grid=grid_values, scoring='roc_auc', cv=3)
grid_clf_auc.fit(X_train, y_train)

print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)
# Recorded output of the two prints above:
# Grid best parameter (max. AUC): {'C': 400}
# Grid best score (AUC): 0.936997156006
Having determined the best C value, which gives a reasonable AUC score, we refit the model with it on all labelled data and predict on the final test data, whose y-values are unknown.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
def salary_predictions():
    """Predict management-salary probabilities for the unlabelled nodes of G.

    Reads the module-level graph ``G``. Each node gets six columns: the
    'Department' and 'ManagementSalary' node attributes, degree,
    betweenness and closeness centrality, and the clustering coefficient.
    Nodes with a non-null 'ManagementSalary' train a min-max-scaled
    ``LogisticRegression(C=400)``; nodes with a null value are scored.

    Returns:
        pandas.Series indexed by the unlabelled nodes, holding the second
        column of ``predict_proba`` (presumably the probability of class 1;
        assumes the labels are 0/1 -- confirm against the data).
    """
    feature_cols = ['dept', 'deg', 'btw', 'close', 'cluster']

    nodes = pd.DataFrame(index=G.nodes())
    # Two node attributes followed by four structural measures. Each value is
    # a dict keyed by node, so the Series aligns on the frame's node index.
    per_node_columns = {
        'dept': nx.get_node_attributes(G, 'Department'),
        'mgmtsalary': nx.get_node_attributes(G, 'ManagementSalary'),
        'deg': nx.degree_centrality(G),
        'btw': nx.betweenness_centrality(G),
        'close': nx.closeness_centrality(G),
        'cluster': nx.clustering(G),
    }
    for column, values in per_node_columns.items():
        nodes[column] = pd.Series(values)

    # Labelled nodes are the training set; unlabelled nodes are scored.
    labelled = nodes[nodes['mgmtsalary'].notnull()]
    unlabelled = nodes[nodes['mgmtsalary'].isnull()]

    # Normalisation: fit on the labelled rows, reuse the same scaling for the rest.
    scaler = MinMaxScaler()
    train_features = scaler.fit_transform(labelled[feature_cols])
    unlabelled_features = scaler.transform(unlabelled[feature_cols])

    # C=400 is the value selected by the preceding grid search.
    model = LogisticRegression(C=400)
    model.fit(train_features, labelled['mgmtsalary'])

    probabilities = model.predict_proba(unlabelled_features)
    return pd.Series(probabilities[:, 1], index=unlabelled.index)
7.2. Future Edge Linkage¶
The dataset provides a list of known future linkages, and the task is to predict the remaining ones, which have not been given.
First, add link metrics for every candidate edge to the DataFrame so that the scikit-learn package can be used.
# Add two link features to fc in place; fc is indexed by candidate node pairs.
# preferential_attachment yields (u, v, score) triples for the pairs it is given.
fc['prefa'] = [score for _, _, score in nx.preferential_attachment(G, fc.index)]
# Number of neighbours shared by the two endpoints of each pair.
fc['cneigh'] = fc.index.map(
    lambda pair: sum(1 for _ in nx.common_neighbors(G, pair[0], pair[1]))
)

# Training rows: pairs whose 'Future Connection' label is present.
fctrain = fc.loc[fc['Future Connection'].notna()]
X = fctrain.loc[:, ['prefa', 'cneigh']]
y = fctrain.loc[:, 'Future Connection']

# Rows to predict: pairs whose 'Future Connection' label is missing.
fctest = fc.loc[fc['Future Connection'].isna()]
test = fctest.loc[:, ['prefa', 'cneigh']]
Then the labelled data is split into training and validation sets and fitted with a logistic regression inside a grid search. An AUC score of 0.91 is obtained.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
# Hold out part of the labelled pairs; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Normalisation: the scaler is fitted on the training split only and then
# applied unchanged to the training, held-out and unlabelled feature sets.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
test = scaler.transform(test)

# Candidate regularisation values for the search.
grid_values = {'C': [1, 10, 20, 30, 40, 50, 100, 110, 120, 122, 125, 140, 150]}
clf = LogisticRegression()

# 3-fold cross-validated search over C, ranked by ROC AUC.
grid_clf_auc = GridSearchCV(estimator=clf, param_grid=grid_values, scoring='roc_auc', cv=3)
grid_clf_auc.fit(X_train, y_train)

print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)
# Recorded output of the two prints above:
# Grid best parameter (max. AUC): {'C': 10}
# Grid best score (AUC): 0.905822220111
Having determined the best C value, which gives a reasonable AUC score, we refit the model with it on all labelled data and predict on the final test data, whose y-values are unknown.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
def new_connections_predictions():
    """Predict future-connection probabilities for the unlabelled pairs in fc.

    Reads the module-level graph ``G`` and DataFrame ``fc`` (indexed by node
    pairs, with a 'Future Connection' column). Pairs with a non-null
    'Future Connection' train a min-max-scaled ``LogisticRegression(C=10)``
    on two features: preferential-attachment score and common-neighbour
    count. Pairs with a null value are scored.

    Side effect:
        Adds or overwrites the 'prefa' and 'cneigh' columns of the
        module-level ``fc``.

    Returns:
        pandas.Series indexed by the unlabelled pairs, holding the second
        column of ``predict_proba`` (presumably the probability of class 1;
        assumes the labels are 0/1 -- confirm against the data).
    """
    feature_cols = ['prefa', 'cneigh']

    # preferential_attachment yields (u, v, score) triples for the given pairs.
    fc['prefa'] = [score for _, _, score in nx.preferential_attachment(G, fc.index)]
    # Number of neighbours shared by the two endpoints of each pair.
    fc['cneigh'] = fc.index.map(
        lambda pair: sum(1 for _ in nx.common_neighbors(G, pair[0], pair[1]))
    )

    # Labelled pairs are the training set; unlabelled pairs are scored.
    labelled = fc[fc['Future Connection'].notnull()]
    unlabelled = fc[fc['Future Connection'].isnull()]

    # Normalisation: fit on the labelled rows, reuse the same scaling for the rest.
    scaler = MinMaxScaler()
    train_features = scaler.fit_transform(labelled[feature_cols])
    unlabelled_features = scaler.transform(unlabelled[feature_cols])

    # C=10 is the value selected by the preceding grid search.
    model = LogisticRegression(C=10)
    model.fit(train_features, labelled['Future Connection'])

    probabilities = model.predict_proba(unlabelled_features)
    return pd.Series(probabilities[:, 1], index=unlabelled.index)