The data sets are taken from:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Load the child-mortality indicator and index it by year.
df_in = pd.read_excel('child_deaths.xlsx', sheet_name='Data')
df_in = df_in.set_index(['year'])
df_tr = df_in.transpose()

# Drop countries that have missing values.
counter_null = df_tr.isnull().sum()
counter_without_null = counter_null[counter_null == 0]
df_all = df_tr[counter_without_null.index]

# Country of interest.
country_in = 'Latvia'

# Correlate every country's series with the chosen country.
df_corr = df_all.corr()
df_des = df_corr[country_in]
cou_names = list(df_all.columns.values)
cou_count = len(cou_names)

# Find the country most strongly correlated with country_in,
# skipping the perfect self-correlation of 1.
max_corr = 0
max_corr_index = 0
for i in range(cou_count):
    if 1 > abs(df_des.iloc[i]) > max_corr:
        max_corr = abs(df_des.iloc[i])
        max_corr_index = i

# Plot the chosen country next to its most correlated partner.
df_most_corr = df_all[[country_in, cou_names[max_corr_index]]]
df_most_corr.plot()
plt.legend()
plt.show()

# Scan every indicator file in the health folder and record how strongly
# the two countries correlate on that indicator.
corr_dect = {}
os.chdir('./data/health')
for file_name in os.listdir(os.getcwd()):
    if file_name.endswith('.xlsx'):
        try:
            df = pd.read_excel(file_name, sheet_name='Data', index_col=0)
            df_reindex = df.reindex(df.index.rename('year'))
            df_temp_tr = df_reindex.transpose()
            df_one = df_temp_tr[[country_in]]
            df_one.columns = ['{0}-{1}'.format(file_name.split('.')[0], country_in)]
            df_two = df_temp_tr[[cou_names[max_corr_index]]]
            df_two.columns = ['{0}-{1}'.format(file_name.split('.')[0], cou_names[max_corr_index])]
            result = pd.concat([df_one, df_two], axis=1)
            result_corr = result.corr()
            rs = result_corr['{0}-{1}'.format(file_name.split('.')[0], country_in)].iloc[1]
            corr_dect[file_name.split('.')[0]] = rs
        except (KeyError, ValueError):
            # Skip indicators that do not contain both countries.
            pass

# Report indicators whose cross-country correlation exceeds the threshold.
des = 0.5
for key, value in corr_dect.items():
    if abs(value) > des:
        print("{0}:{1}".format(key, value))
Environmental, social, and economic changes depend on many factors, which may be correlated with one another or not correlated at all, and a strong correlation is not by itself evidence of causation. Our work is devoted to finding the hidden relations between these variables over a known period of time using big-data analysis techniques.
Recently the incidence of diseases such as cancer and obesity has been increasing, and the number of people dying unexpectedly has also risen, often without the real cause being known. Some believe these diseases and sudden deaths are caused by the growing amount of genetically modified food on the market; others believe it is because people have stopped exercising and work all the time; another group thinks the real reason is that people never release the stress of work. There is a plethora of opinions about the true cause of this phenomenon, but the real reason remains hidden. Many phenomena, in different fields of our lives, have hidden causes, yet people do not try to uncover them because so many uncertain explanations surround them that finding the real one seems too hard.
The aim of this work is to create a system capable of establishing reliable relations between data sets and, consequently, of suggesting the main cause of a specific phenomenon from many seemingly unrelated indicators, such as age at first marriage (women), agricultural land (% of land area), all-cause deaths in children aged 1-59 months (per 1,000 births), new cervical cancer cases per 100,000 women, CO2 emissions (tonnes per person), and so on, for the same countries over any chosen period of time.
The system can analyse dependencies between environmental, social, and economic data in order to help control unwanted changes and crises.
Put simply, the system aims to identify the causes of any environmental, social, or economic change in a specific country by searching for another country whose data correlate with those of the country of interest.
Statistics gives us an important warning: it is risky to base decisions on correlation alone, because correlation cannot detect causation. We therefore take a shortcut towards causation by comparing countries whose data of interest behave similarly over time.
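As a toy illustration of what this comparison means (the series and numbers below are made up for illustration and are not taken from our data sets), pandas' DataFrame.corr() returns pairwise Pearson correlation coefficients; a value close to 1 or -1 only tells us that two countries' series move together over time, not why they do.

import pandas as pd

# Hypothetical yearly series for two countries (illustrative values only).
toy = pd.DataFrame(
    {
        'CountryA': [5.1, 4.8, 4.4, 4.0, 3.7],
        'CountryB': [9.9, 9.2, 8.6, 7.9, 7.4],
    },
    index=[2000, 2001, 2002, 2003, 2004],
)

# Pairwise Pearson correlation matrix; values near 1 mean the series
# behave similarly over time, which is not proof of causation.
print(toy.corr())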
The software performs a big-data analysis, assimilates the results, and proposes causes for the observed behaviour of the data. The code is written in Python using the pandas, matplotlib, and NumPy libraries. The user enters the data whose causes they want to know for a chosen country; the software then searches for another country that correlates with the entered one, and finally scans our database for a pair of highly correlated indicators that could be a cause of the data of interest.
So the final output data can be regarded as a candidate cause of the data the user entered.
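The scripts in this repository implement this pipeline step by step. The sketch below only restates it as a single hypothetical helper, assuming Gapminder-style spreadsheets (the first column of the 'Data' sheet holding country names and the remaining columns holding years) and a folder of indicator files; the function name find_candidate_causes and its parameters are ours and do not appear in the scripts.

import os

import pandas as pd


def find_candidate_causes(base_file, country, data_dir, threshold=0.5):
    """Hypothetical helper restating the pipeline described above.

    1. In the base indicator, find the country whose series correlates
       most strongly with `country`.
    2. Scan every other indicator file and keep those on which the two
       countries are also strongly correlated.
    """
    base = pd.read_excel(base_file, sheet_name='Data', index_col=0).transpose()
    base = base.dropna(axis=1)                 # keep complete series only
    corr = base.corr()[country].drop(country)  # drop the self-correlation
    partner = corr.abs().idxmax()              # most correlated country

    candidates = {}
    for name in os.listdir(data_dir):
        if not name.endswith('.xlsx'):
            continue
        try:
            ind = pd.read_excel(os.path.join(data_dir, name),
                                sheet_name='Data', index_col=0).transpose()
            r = ind[country].corr(ind[partner])  # cross-country correlation
            if abs(r) > threshold:
                candidates[name.split('.')[0]] = r
        except (KeyError, ValueError):
            continue  # skip files missing one of the two countries
    return partner, candidates


# Example call (hypothetical paths):
# partner, causes = find_candidate_causes('child_deaths.xlsx', 'Latvia', './data/health')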
Our idea is getting bigger !!!!
import matplotlib.pyplot as plt
import pandas as pd

# Ask the user which column to analyse and the minimum correlation ratio to report.
data = input("Enter the desired search data\n")
ratio = float(input("Enter the ratio: "))

# Categories of indicators known to the system.
category_dict = {
    'climate': ['temp', 'hum'],
    'health': ['disease'],
    'agr': ['food'],
}

# Priority dictionary prepared beforehand (pr_dict.csv).
df_pr = pd.read_csv('pr_dict.csv').set_index('index')
print(df_pr['health'].iloc[0])


def get_redundant_pairs(df):
    """Get diagonal and lower-triangular pairs of a correlation matrix."""
    pairs_to_drop = set()
    cols = df.columns
    for i in range(df.shape[1]):
        for j in range(i + 1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop


def get_top_abs_correlations(df, n=1):
    """Return the n strongest absolute pairwise correlations in df."""
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]


def append_category(category, item):
    """Add a new indicator name to an existing category."""
    if category in category_dict:
        category_dict[category].append(item)


def rate(correlations, categories):
    """Placeholder for weighting correlations by their category (not implemented yet)."""
    pass


def find_data_cat(data, categories):
    """Return the category a given indicator belongs to, if any."""
    for cat, var in categories.items():
        if data in var:
            return cat


append_category('health', 'noise')

# Load the measurement table and index it by year.
df = pd.read_csv('p.csv')
df.set_index('year', inplace=True)

# Correlate every column with the user-selected one.
core = df.corr()
desired = core[data]

# Absolute correlations of the selected column with every other column
# (the self-correlation of exactly 1 is skipped).
data_correlated_ratio = {}
top_related_data = None
top_value = 0
for name in core.columns:
    value = abs(desired[name])
    if name == data or value >= 1:
        continue
    data_correlated_ratio[name] = value
    if value > top_value:
        top_value = value
        top_related_data = name

# ------------------ top related data and its ratio ------------------ #
print("Top related data is: " + str(top_related_data))
print("Related data above the selected correlation ratio:")
for name, value in data_correlated_ratio.items():
    if value >= ratio:
        print("{0} ---> {1}".format(name, value))

rate(data_correlated_ratio, category_dict)

df.plot()
plt.legend()
plt.show()
SpaceApps is a NASA incubator innovation program.