# load necessary packages
import contextlib
import gzip
import urllib.request

import pandas as pd
# Default location of the PathwayCommons v12 txt (long .sif) archive.
_PATHWAY_COMMONS_URL = "https://www.pathwaycommons.org/archives/PC2/v12/PathwayCommons12.All.hgnc.txt.gz"

# Interaction types kept when the caller does not pass a list of their own.
_DEFAULT_INTERACTIONS = (
    "controls-expression-of", "interacts-with", "controls-phosphorylation-of",
    "controls-state-change-of", "controls-production-of", "catalysis-precedes",
    "controls-transport-of", "controls-transport-of-chemical", "chemical-affects",
    "consumption-controlled-by", "used-to-produce", "reacts-with",
)


def _filter_chunk(chunk, interactions):
    """Return the rows of *chunk* with a wanted INTERACTION_TYPE, without MEDIATOR_IDS."""
    kept = chunk.loc[chunk["INTERACTION_TYPE"].isin(interactions)]
    # errors="ignore": a missing MEDIATOR_IDS column is not a reason to abort.
    kept = kept.drop(columns="MEDIATOR_IDS", errors="ignore")
    # <here further process the data>
    return kept


def DownloadAndProcessPathwayCommonsData(filename="PathwayCommons.csv",
                                         url=_PATHWAY_COMMONS_URL,
                                         interactions=None,
                                         chunksize=1000):
    """Download PathwayCommons data, filter it, and save it as a CSV file.

    The data come in txt format - long .sif file (see details here:
    https://www.pathwaycommons.org/pc2/formats). The gzip archive is read as a
    stream with pandas ``read_csv`` in chunks of ``chunksize`` rows, so the
    whole unfiltered file is never held in memory. Each chunk is filtered as
    follows:
    * only rows whose INTERACTION_TYPE value is listed in ``interactions``
      are kept
    * column MEDIATOR_IDS is deleted
    The filtered chunks are concatenated and written to ``filename``. Column
    names are taken from the header of the downloaded file.

    Keyword arguments:
    filename --- path to the file where data will be saved, as a default in
                 the current directory under the name "PathwayCommons.csv"
    url --- address of the gzip-compressed, tab-separated file to read; any
            URL accepted by ``urllib.request.urlopen`` works
    interactions --- iterable of INTERACTION_TYPE values to keep; ``None``
                     (the default) keeps the built-in list. Pass a narrower
                     list (for example one without "interacts-with") to
                     restrict the output to fewer interaction types
    chunksize --- number of rows read from the file at a time

    Outputs:
    result --- the text "data downloaded", returned once the CSV is written

    Raises:
    urllib.error.URLError if the URL cannot be opened, and KeyError if the
    file has no INTERACTION_TYPE column.
    """
    wanted = list(_DEFAULT_INTERACTIONS if interactions is None else interactions)

    filtered_chunks = []
    with contextlib.closing(urllib.request.urlopen(url=url)) as response:
        # Decompress on the fly while pandas pulls rows from the stream.
        with gzip.GzipFile(fileobj=response) as gzip_fd:
            for chunk in pd.read_csv(gzip_fd, chunksize=chunksize, sep="\t"):
                filtered_chunks.append(_filter_chunk(chunk, wanted))

    if filtered_chunks:
        # ignore_index gives the saved file one continuous 0..n-1 row index.
        pathwaydata = pd.concat(filtered_chunks, ignore_index=True)
    else:
        pathwaydata = pd.DataFrame()

    pathwaydata.to_csv(filename)  # save data
    return "data downloaded"
if __name__ == "__main__":
    # download PathwayCommons database
    # Raw string: the backslashes of the Windows path are kept literally
    # instead of being read as (invalid) escape sequences.
    result = DownloadAndProcessPathwayCommonsData(filename=r"C:\Ruby\data\PathwayCommons.csv")
    print(result)
Hi, I need help with the code changes in lines 36-42. In the above script, what changes do I need to make so that the data downloaded from the online database and saved as a CSV file contains, in the "INTERACTION_TYPE" column, only signaling-pathway interactions and not protein-protein interactions? Can anybody help me? Thanks in advance.
Aucun commentaire:
Enregistrer un commentaire