PubmedSpreadsheet Generation Code

From GersteinInfo
Jump to navigation | Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

parse_pmids.py


#!/usr/bin/python
import os, sys
from GoogleSpreadsheet import GoogleSpreadsheet
from datetime import datetime
# Spreadsheet and worksheet identifiers used to open the master paper list.
# The spreadsheet id shown here is a placeholder value.
master_spreadsheet_id = "******************"
worksheet_id = "od6"
master_spreadsheet = GoogleSpreadsheet(master_spreadsheet_id, worksheet_id)

# File that receives the generated efetch URL.
ncbiquery = "/home/mpw6/new_papers/ncbiquery.txt"
ncbiFile = open(ncbiquery, 'w')
def buildQuery(master_spreadsheet, ncbiFile):
    """Write one NCBI efetch URL covering every PMID in the spreadsheet.

    Args:
        master_spreadsheet: Iterable of rows. Each row is read with
            ``row['pmid']``; rows whose value is falsy (empty or None)
            are skipped.
        ncbiFile: Writable file-like object. It receives the URL followed
            by a single newline.
    """
    start = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id="
    end = "&rettype=xml&retmode=file"
    # Leading apostrophes are removed from each id before it is placed in
    # the comma-separated id list.
    pmids = ','.join(row['pmid'].lstrip("'")
                     for row in master_spreadsheet
                     if row['pmid'])
    ncbiFile.write(start + pmids + end + '\n')
buildQuery(master_spreadsheet, ncbiFile)
# Close explicitly so the query is flushed to disk rather than relying on
# interpreter shutdown to do it.
ncbiFile.close()

PubmedHandler.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import date
from xml.sax import ContentHandler
import xml.sax.saxutils
class PubmedHandler(ContentHandler):
    """SAX content handler that flattens PubMed XML into a tab-separated file.

    The header row is written when the handler is created, and one data row
    is written each time a ``PubmedArticle`` element closes.

    NOTE: ``handling`` is cleared on every closing tag, so text that follows
    a nested inline element inside a captured element is not collected.
    """

    def __init__(self, output_file='export.tab'):
        """Open ``output_file`` for writing and emit the header row.

        Args:
            output_file: Path of the tab-separated file to create.
        """
        # Binary mode: every row is encoded to UTF-8 explicitly before it is
        # written, which behaves the same on Python 2 and Python 3.
        self.file = open(output_file, 'wb')
        self.sep = '\t'
        self.data = {}
        self.key = ''
        # Number of PMID elements seen in the current article; only the
        # first one is recorded (see characters()).
        self.pmid = 0
        self.authors = []
        self.handling = False
        # Defined up front so characters() can run before any article starts.
        self.is_articledate = False
        self.is_pubdate = False
        self.elements = ['Initials',        # multiple, name unique
                         'LastName',        # multiple, name unique
                         'MedlineTA',       # unique
                         'PMID',            # unique
                         'OtherID',         # unique
                         'MedlinePgn',      # unique
                         'Volume',          # unique
                         'Issue',           # unique
                         'Year',            # inside PubDate
                         'Month',           # inside PubDate
                         'Day',             # inside PubDate
                         'ArticleTitle',    # unique
                         'CollectiveName',  # unique
                         'ArticleId',
                         'PubDate',
                         ]
        self.file.write(self.get_header().encode('utf-8'))

    def close(self):
        """Close the output file; safe to call more than once."""
        output = getattr(self, 'file', None)
        if output is not None and not output.closed:
            output.close()

    def __del__(self):
        # getattr-based close() also copes with __init__ having failed
        # before self.file was assigned.
        self.close()

    def endDocument(self):
        """Flush buffered rows once the parser reports the end of input."""
        if not self.file.closed:
            self.file.flush()

    def startElement(self, name, attrs):
        """Reset per-article state or start capturing text for ``name``."""
        if name == 'PubmedArticle':
            self.data = {'PMID': ''}
            self.authors = []
            self.is_articledate = False
            self.is_pubdate = False
            self.pmid = 0
            self.handling = False
        elif name == 'PubDate':
            self.is_pubdate = True
            self.data['Year'] = self.data['Month'] = self.data['Day'] = ''
        elif name in self.elements:
            if name == 'ArticleId':
                # Only the PMC identifier is kept; get() tolerates an
                # ArticleId element that has no IdType attribute.
                if attrs.get('IdType') == 'pmc':
                    self.key = 'PMCID'
                    self.data['PMCID'] = ''
                    self.handling = True
            else:
                self.key = name
                self.handling = True
                if name == 'PMID':
                    self.pmid += 1
                # Date parts are reset by PubDate and PMID by PubmedArticle;
                # every other captured element starts from an empty string.
                if name not in ('Year', 'Month', 'Day', 'PMID'):
                    self.data[name] = ''

    def endElement(self, name):
        """Finish an element: write the row, close PubDate or merge an author."""
        if name == 'PubmedArticle':
            self.write_to_file()
        elif name == 'PubDate':
            self.is_pubdate = False
        elif name == 'Author':
            initials = self.get_data('Initials')
            last_name = self.get_data('LastName')
            collective = self.get_data('CollectiveName')
            if initials or last_name:
                # Join only the parts that are present so an author without
                # initials is still listed.
                person = ' '.join(part for part in (initials, last_name) if part)
                self.authors.append(person)
            if collective:
                self.authors.append(collective)
            # Clear all name parts so they cannot leak into the next Author
            # element (e.g. a collective name following a person).
            self.data['Initials'] = ''
            self.data['LastName'] = ''
            self.data['CollectiveName'] = ''
        self.handling = False

    def characters(self, content):
        """Append ``content`` to the field currently being captured."""
        if not self.handling:
            return
        if self.key in ('Year', 'Month', 'Day'):
            # Date parts are only taken from inside a PubDate element.
            if not self.is_pubdate:
                return
        elif self.key == 'PMID':
            # Only the first PMID element of an article is recorded.
            if self.pmid != 1:
                return
        else:
            content = xml.sax.saxutils.escape(content)
        self.data[self.key] = self.data.get(self.key, '') + content

    def write_to_file(self):
        """Write the current article as one UTF-8 encoded row.

        Returns:
            None on success; an empty list if the row could not be encoded
            (the error and the row are printed in that case).
        """
        row = self.get_row()
        try:
            self.file.write(row.encode('utf-8'))
        except UnicodeEncodeError as anomerr:
            print(anomerr)
            print(row)
            return []

    def get_row(self):
        """Return the current article as a separator-joined line."""
        return self.sep.join([', '.join(self.authors),
                              self.get_data('MedlineTA'),
                              self.get_data('PMID'),
                              self.get_data('MedlinePgn'),
                              self.get_data('Volume'),
                              self.get_data('Year'),
                              self.get_citation(),
                              self.get_data('ArticleTitle'),
                              self.get_data('PMCID')]) + '\n'

    def get_citation(self):
        """Build the citation text: ``(Year). Journal Volume:Pages.``"""
        citation = ''
        year = self.get_data('Year')
        if year:
            citation += '(' + year + '). '
        # NOTE(review): the published listing wraps the journal name in two
        # empty strings, which may be markup lost in extraction -- confirm
        # against the deployed script.
        citation += self.get_data('MedlineTA') + ' '
        ref_str = self.get_data('Volume')
        pages = self.get_data('MedlinePgn')
        if pages:
            ref_str += ':' + pages
        citation += ref_str.strip()
        # Drop the separator space when no volume/pages followed it, so the
        # final period attaches directly to the text.
        citation = citation.rstrip()
        if not citation.endswith('.'):
            citation += '.'
        return citation

    def get_data(self, key):
        """Return the captured text for ``key``, or '' if nothing was captured."""
        return self.data.get(key, '')

    def get_int_data(self, key):
        """Return the captured text for ``key`` as an int, or 0 if not numeric."""
        try:
            return int(self.get_data(key))
        except ValueError:
            return 0

    def get_header(self):
        """Return the header line matching the column order of get_row()."""
        return self.sep.join(['Authors', 'Journal', 'PMID', 'Pages', 'Volume',
                              'Year', 'Citation', 'Title', 'PMCID']) + '\n'

import.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
from PubmedHandler import PubmedHandler
from xml import sax
import sys
import os
def main(input_file, output_file):
    """Parse a PubMed XML file and write it out as a tab-separated table.

    Args:
        input_file: XML source handed to the SAX parser's parse().
        output_file: Path of the tab-separated file PubmedHandler writes.
    """
    parser = sax.make_parser()
    handler = PubmedHandler(output_file)
    parser.setContentHandler(handler)
    try:
        parser.parse(input_file)
    finally:
        # Close here instead of waiting for the handler to be garbage
        # collected, so the table is complete on disk when main() returns.
        handler.file.close()


def _escape_accents(source_file, target_file='export_out.tab'):
    """Run the accent-rewriting sed script over source_file into target_file.

    Returns:
        The exit status of sed.
    """
    import subprocess  # local import: only this post-processing step needs it

    # NOTE(review): as listed, each expression replaces a character with a
    # backslash followed by the same character; the replacement text may have
    # lost HTML entities when the page was rendered -- confirm against the
    # deployed script.
    expressions = ['s/ä/\\ä/g', 's/ç/\\ç/g', 's/ü/\\ü/g', 's/ó/\\ó/g',
                   's/ö/\\ö/g', 's/ş/\\ş/g', 's/í/\\í/g', 's/é/\\é/g',
                   's/ô/\\ô/g', 's/è/\\è/g']
    command = ['sed']
    for expression in expressions:
        command.extend(['-e', expression])
    command.append(source_file)
    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters from being reinterpreted.
    with open(target_file, 'wb') as target:
        return subprocess.call(command, stdout=target)


if __name__ == '__main__':
    input_file = 'NCBIData.xml'
    output_file = 'export.tab'
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    main(input_file, output_file)
    # Post-process the file that was actually written (it may have been
    # overridden on the command line); the result name stays export_out.tab.
    _escape_accents(output_file)

NewGoogleSpreadSheet.py

#!/usr/bin/python
import gdata.spreadsheet.service
class GoogleSpreadsheetException(Exception):
    """Raised when a change to the remote spreadsheet does not succeed."""

class SpreadsheetRow:
    """Row in a GoogleSpreadsheet"""

    def __init__(self, index, row_dict):
        """Inits SpreadsheetRow.

        Args:
            index: Row position in spreadsheet.
            row_dict: Dict with the data for the row in form
                {'header1' : value1, 'header2' : value2, 'header3' : value3, ...}
        """
        self.index = index
        self.row_dict = row_dict

    def __getitem__(self, header):
        """Returns the value stored under a column header, e.g. row['pmid']."""
        return self.row_dict[header]

    def __str__(self):
        return str(self.row_dict)

class GoogleSpreadsheet:
    """An iterable Google Spreadsheet."""

    # NOTE(review): account credentials are supplied as default arguments
    # (placeholders in this listing); prefer passing them in from
    # configuration rather than keeping them in source.
    def __init__(self, spreadsheet_id, worksheet_id, user='*************', password='**************'):
        """Inits GoogleSpreadsheet.

        Args:
            spreadsheet_id: Google-assigned unique id for spreadsheet
            worksheet_id: Google-assigned unique id for worksheet within
                spreadsheet
            user: Google account id with access to spreadsheet
            password: Password for Google account
        """
        self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
        self.gd_client.email = user
        self.gd_client.password = password
        self.gd_client.ProgrammaticLogin()

        # Cursor used only by the explicit next() method below.
        self.count = 0
        self.spreadsheet_id = spreadsheet_id
        self.worksheet_id = worksheet_id
        # Fetches the list feed and builds self.rows.
        self.reload()

    def form_rows(self, ListFeed):
        """Builds row data from the Google Spreadsheet.

        Args:
            ListFeed: SpreadsheetsListFeed object for the spreadsheet

        Returns:
            List of SpreadsheetRow objects, one per feed entry, indexed by
            their position in the feed.
        """
        rows = []
        for position, entry in enumerate(ListFeed.entry):
            values = dict((key, entry.custom[key].text) for key in entry.custom)
            rows.append(SpreadsheetRow(position, values))
        return rows

    def reload(self):
        """Refreshes object with current state of the remote spreadsheet"""
        self.feed = self.gd_client.GetListFeed(self.spreadsheet_id, self.worksheet_id)
        self.rows = self.form_rows(self.feed)

    def update_row(self, row):
        """Replaces a row on the remote spreadsheet and updates current object.

        Args:
            row: SpreadsheetRow object to be updated

        Raises:
            GoogleSpreadsheetException: If the service does not return a
                SpreadsheetsList entry.
        """
        entry = self.gd_client.UpdateRow(self.feed.entry[row.index], row.row_dict)
        if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
            raise GoogleSpreadsheetException("Error updating row")
        self.reload()

    def add_row(self, row):
        """Adds an additional row to the remote spreadsheet and updates current object.

        Args:
            row: SpreadsheetRow object to be added

        Raises:
            GoogleSpreadsheetException: If the service does not return a
                SpreadsheetsList entry.
        """
        entry = self.gd_client.InsertRow(row.row_dict, self.spreadsheet_id, self.worksheet_id)
        if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
            raise GoogleSpreadsheetException("Error adding row")
        self.reload()

    def delete_row(self, row):
        """Deletes a row from the remote spreadsheet and updates current object.

        Args:
            row: SpreadsheetRow object to be deleted
        """
        self.gd_client.DeleteRow(self.feed.entry[row.index])
        self.reload()

    def __iter__(self):
        """Yields the rows in order, with an independent position per loop.

        self.rows is re-read on every step, so a reload() triggered while
        iterating continues by position in the refreshed list.
        """
        position = 0
        while position < len(self.rows):
            yield self.rows[position]
            position += 1

    def next(self):
        """Returns the next row using the object's own shared cursor.

        Raises:
            StopIteration: When the cursor is past the last row; the cursor
                is reset to the start at that point.
        """
        if self.count >= len(self.rows):
            self.count = 0
            raise StopIteration
        self.count += 1
        return self.rows[self.count - 1]

    # Python 3 spelling of the iterator method.
    __next__ = next

    def __getitem__(self, index):
        return self.rows[index]

    def __len__(self):
        return len(self.rows)