The data sets are taken from:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Load the child-mortality indicator and index it by year.
df_in = pd.read_excel('child_deaths.xlsx', sheet_name='Data')
df_in = df_in.set_index(['year'])
df_tr = df_in.transpose()

# Drop countries that have missing values.
counter_null = df_tr.isnull().sum()
counter_without_null = counter_null[counter_null == 0]
df_all = df_tr[counter_without_null.index]

# Country of interest.
country_in = 'Latvia'

# Correlate every country's series with the chosen country.
df_corr = df_all.corr()
df_des = df_corr[country_in]
cou_names = list(df_all.columns.values)
cou_count = len(cou_names)

# Find the country most strongly correlated with country_in,
# skipping the perfect self-correlation of 1.
max_corr = 0
max_corr_index = 0
for i in range(cou_count):
    if 1 > abs(df_des.iloc[i]) > max_corr:
        max_corr = abs(df_des.iloc[i])
        max_corr_index = i

# Plot the chosen country next to its most correlated partner.
df_most_corr = df_all[[country_in, cou_names[max_corr_index]]]
df_most_corr.plot()
plt.legend()
plt.show()

# Scan every indicator file in the health folder and record how strongly
# the two countries correlate on that indicator.
corr_dect = {}
os.chdir('./data/health')
for file_name in os.listdir(os.getcwd()):
    if file_name.endswith('.xlsx'):
        try:
            df = pd.read_excel(file_name, sheet_name='Data', index_col=0)
            df_reindex = df.reindex(df.index.rename('year'))
            df_temp_tr = df_reindex.transpose()
            df_one = df_temp_tr[[country_in]]
            df_one.columns = ['{0}-{1}'.format(file_name.split('.')[0], country_in)]
            df_two = df_temp_tr[[cou_names[max_corr_index]]]
            df_two.columns = ['{0}-{1}'.format(file_name.split('.')[0], cou_names[max_corr_index])]
            result = pd.concat([df_one, df_two], axis=1)
            result_corr = result.corr()
            rs = result_corr['{0}-{1}'.format(file_name.split('.')[0], country_in)].iloc[1]
            corr_dect[file_name.split('.')[0]] = rs
        except (KeyError, ValueError):
            # Skip indicators that do not contain both countries.
            pass

# Report indicators whose cross-country correlation exceeds the threshold.
des = 0.5
for key, value in corr_dect.items():
    if abs(value) > des:
        print("{0}:{1}".format(key, value))
Environmental, social, and economic changes depend on many factors, which may be correlated with one another or not correlated at all, and a strong correlation is not by itself evidence of causation. Our work is devoted to finding the hidden relations between these variables over a known period of time using big-data analysis techniques.
Recently the incidence of diseases such as cancer and obesity has been increasing, and the number of people dying unexpectedly has also risen, often without the real cause being known. Some believe these diseases and sudden deaths are caused by the growing amount of genetically modified food on the market; others believe it is because people have stopped exercising and work all the time; another group thinks the real reason is that people never release the stress of work. There is a plethora of opinions about the true cause of this phenomenon, but the real reason remains hidden. Many phenomena, in different fields of our lives, have hidden causes, yet people do not try to uncover them because so many uncertain explanations surround them that finding the real one seems too hard.
The aim of this work is to create a system capable of establishing reliable relations between data sets and, consequently, of suggesting the main cause of a specific phenomenon from many seemingly unrelated indicators, such as age at first marriage (women), agricultural land (% of land area), all-cause deaths in children aged 1-59 months (per 1,000 births), new cervical cancer cases per 100,000 women, CO2 emissions (tonnes per person), and so on, for the same countries over any chosen period of time.
The system can analyse dependencies between environmental, social, and economic data in order to help control unwanted changes and crises.
Put simply, the system aims to identify the causes of any environmental, social, or economic change in a specific country by searching for another country whose data correlate with those of the country of interest.
Statistics gives us an important warning: it is risky to base decisions on correlation alone, because correlation cannot detect causation. We therefore take a shortcut towards causation by comparing countries whose data of interest behave similarly over time.
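As a toy illustration of what this comparison means (the series and numbers below are made up for illustration and are not taken from our data sets), pandas' DataFrame.corr() returns pairwise Pearson correlation coefficients; a value close to 1 or -1 only tells us that two countries' series move together over time, not why they do.

import pandas as pd

# Hypothetical yearly series for two countries (illustrative values only).
toy = pd.DataFrame(
    {
        'CountryA': [5.1, 4.8, 4.4, 4.0, 3.7],
        'CountryB': [9.9, 9.2, 8.6, 7.9, 7.4],
    },
    index=[2000, 2001, 2002, 2003, 2004],
)

# Pairwise Pearson correlation matrix; values near 1 mean the series
# behave similarly over time, which is not proof of causation.
print(toy.corr())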
The software performs a big-data analysis, assimilates the results, and proposes causes for the observed behaviour of the data. The code is written in Python using the pandas, matplotlib, and NumPy libraries. The user enters the data whose causes they want to know for a chosen country; the software then searches for another country that correlates with the entered one, and finally scans our database for a pair of highly correlated indicators that could be a cause of the data of interest.
So the final output data can be regarded as a candidate cause of the data the user entered.
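The scripts in this repository implement this pipeline step by step. The sketch below only restates it as a single hypothetical helper, assuming Gapminder-style spreadsheets (the first column of the 'Data' sheet holding country names and the remaining columns holding years) and a folder of indicator files; the function name find_candidate_causes and its parameters are ours and do not appear in the scripts.

import os

import pandas as pd


def find_candidate_causes(base_file, country, data_dir, threshold=0.5):
    """Hypothetical helper restating the pipeline described above.

    1. In the base indicator, find the country whose series correlates
       most strongly with `country`.
    2. Scan every other indicator file and keep those on which the two
       countries are also strongly correlated.
    """
    base = pd.read_excel(base_file, sheet_name='Data', index_col=0).transpose()
    base = base.dropna(axis=1)                 # keep complete series only
    corr = base.corr()[country].drop(country)  # drop the self-correlation
    partner = corr.abs().idxmax()              # most correlated country

    candidates = {}
    for name in os.listdir(data_dir):
        if not name.endswith('.xlsx'):
            continue
        try:
            ind = pd.read_excel(os.path.join(data_dir, name),
                                sheet_name='Data', index_col=0).transpose()
            r = ind[country].corr(ind[partner])  # cross-country correlation
            if abs(r) > threshold:
                candidates[name.split('.')[0]] = r
        except (KeyError, ValueError):
            continue  # skip files missing one of the two countries
    return partner, candidates


# Example call (hypothetical paths):
# partner, causes = find_candidate_causes('child_deaths.xlsx', 'Latvia', './data/health')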
Our idea is getting bigger !!!!
import matplotlib.pyplot as plt
import pandas as pd

# Ask the user which column to analyse and the minimum correlation ratio to report.
data = input("Enter the desired search data\n")
ratio = float(input("Enter the ratio: "))

# Categories of indicators known to the system.
category_dict = {
    'climate': ['temp', 'hum'],
    'health': ['disease'],
    'agr': ['food'],
}

# Priority dictionary prepared beforehand (pr_dict.csv).
df_pr = pd.read_csv('pr_dict.csv').set_index('index')
print(df_pr['health'].iloc[0])


def get_redundant_pairs(df):
    """Get diagonal and lower-triangular pairs of a correlation matrix."""
    pairs_to_drop = set()
    cols = df.columns
    for i in range(df.shape[1]):
        for j in range(i + 1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop


def get_top_abs_correlations(df, n=1):
    """Return the n strongest absolute pairwise correlations in df."""
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]


def append_category(category, item):
    """Add a new indicator name to an existing category."""
    if category in category_dict:
        category_dict[category].append(item)


def rate(correlations, categories):
    """Placeholder for weighting correlations by their category (not implemented yet)."""
    pass


def find_data_cat(data, categories):
    """Return the category a given indicator belongs to, if any."""
    for cat, var in categories.items():
        if data in var:
            return cat


append_category('health', 'noise')

# Load the measurement table and index it by year.
df = pd.read_csv('p.csv')
df.set_index('year', inplace=True)

# Correlate every column with the user-selected one.
core = df.corr()
desired = core[data]

# Absolute correlations of the selected column with every other column
# (the self-correlation of exactly 1 is skipped).
data_correlated_ratio = {}
top_related_data = None
top_value = 0
for name in core.columns:
    value = abs(desired[name])
    if name == data or value >= 1:
        continue
    data_correlated_ratio[name] = value
    if value > top_value:
        top_value = value
        top_related_data = name

# ------------------ top related data and its ratio ------------------ #
print("Top related data is: " + str(top_related_data))
print("Related data above the selected correlation ratio:")
for name, value in data_correlated_ratio.items():
    if value >= ratio:
        print("{0} ---> {1}".format(name, value))

rate(data_correlated_ratio, category_dict)

df.plot()
plt.legend()
plt.show()
SpaceApps is a NASA incubator innovation program.