mardi 28 septembre 2021

INTERACTION_TYPE

  # load necessary packages

import contextlib
import gzip
import urllib.request

import pandas as pd

def DownloadAndProcessPathwayCommonsData(
    filename="PathwayCommons.csv", interactions=None, url=None, chunksize=1000
):
    """Download the PathwayCommons extended-SIF dump and save a filtered CSV.

    The gzipped, tab-separated text file (format described at
    https://www.pathwaycommons.org/pc2/formats) is streamed and parsed in
    chunks of ``chunksize`` rows. Each chunk is filtered as follows:

    * only rows whose INTERACTION_TYPE is listed in ``interactions`` are kept
    * the MEDIATOR_IDS column is dropped

    The remaining rows are concatenated and written to ``filename`` as CSV
    (including a 0-based row index as the first column).

    Keyword arguments:
    filename --- path of the CSV file to write; defaults to
        "PathwayCommons.csv" in the current directory
    interactions --- iterable of INTERACTION_TYPE values to keep; defaults to
        the twelve types listed below. Pass a narrower list to restrict the
        output, e.g. leave out "interacts-with" to exclude generic
        interaction edges (see the PathwayCommons format page for the
        meaning of each type).
    url --- location of the gzipped extended-SIF file; defaults to the
        PathwayCommons v12 "All.hgnc" archive
    chunksize --- number of rows parsed per chunk

    Outputs:
    result --- the text "data downloaded", returned once the CSV is written

    Raises:
    KeyError --- if the input lacks the INTERACTION_TYPE or MEDIATOR_IDS column
    urllib.error.URLError --- if the url cannot be opened
    """
    if url is None:
        # url where PathwayCommons is located
        url = (
            "https://www.pathwaycommons.org/archives/PC2/v12/"
            "PathwayCommons12.All.hgnc.txt.gz"
        )
    if interactions is None:
        # list with all important interaction types
        interactions = [
            "controls-expression-of", "interacts-with",
            "controls-phosphorylation-of", "controls-state-change-of",
            "controls-production-of", "catalysis-precedes",
            "controls-transport-of", "controls-transport-of-chemical",
            "chemical-affects", "consumption-controlled-by",
            "used-to-produce", "reacts-with",
        ]
    wanted_types = set(interactions)

    # Columns left after MEDIATOR_IDS is removed; only used to give an empty
    # result a proper header.
    col_names = [
        "PARTICIPANT_A", "INTERACTION_TYPE", "PARTICIPANT_B",
        "INTERACTION_DATA_SOURCE", "INTERACTION_PUBMED_ID", "PATHWAY_NAMES",
    ]

    kept_chunks = []
    with contextlib.closing(urllib.request.urlopen(url=url)) as response:
        # Decompress on the fly so the archive is never held in memory whole.
        with gzip.GzipFile(fileobj=response) as gzip_fd:
            # dtype=str keeps identifiers as text; otherwise a chunk with
            # missing values turns numeric-looking IDs into floats ("123.0").
            reader = pd.read_csv(
                gzip_fd, chunksize=chunksize, sep="\t", dtype=str
            )
            for chunk in reader:
                # Rows that do not describe an interaction (anything with a
                # non-matching or missing INTERACTION_TYPE) are dropped here.
                selected = chunk.loc[
                    chunk["INTERACTION_TYPE"].isin(wanted_types)
                ]
                selected = selected.drop(columns="MEDIATOR_IDS")
                if not selected.empty:
                    kept_chunks.append(selected)

    if kept_chunks:
        # ignore_index gives the output a clean 0..n-1 row index.
        pathwaydata = pd.concat(kept_chunks, ignore_index=True)
    else:
        pathwaydata = pd.DataFrame(columns=col_names)

    pathwaydata.to_csv(filename)  # save data
    return "data downloaded"

if __name__ == "__main__":
    # Download the PathwayCommons database and report the outcome.
    # Raw string: the Windows path contains backslashes that must not be
    # read as escape sequences.
    result = DownloadAndProcessPathwayCommonsData(
        filename=r"C:\Ruby\data\PathwayCommons.csv"
    )
    print(result)

Hi, I need help with the code changes in lines 36-42. In the script above, how can I change the filtering on "INTERACTION_TYPE" so that the CSV file built from the online database contains only interactions from signaling pathways, and not those from protein-protein interaction pathways? Can anybody help me? Thanks in advance.




Aucun commentaire:

Enregistrer un commentaire