parse_pmids.py
#!/usr/bin/python
import os, sys
from GoogleSpreadsheet import GoogleSpreadsheet
from datetime import datetime
master_spreadsheet_id = "******************"
worksheet_id = "od6"
master_spreadsheet = GoogleSpreadsheet(master_spreadsheet_id, worksheet_id)
ncbiquery = "/home/mpw6/new_papers/ncbiquery.txt"
ncbiFile = open(ncbiquery,'w')
def buildQuery(master_spreadsheet, ncbiFile):
    # Build a single EFetch URL containing every PMID found in the master spreadsheet.
    start = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id="
    pmids = ''
    end = "&rettype=xml&retmode=file"
    for row in master_spreadsheet:
        if row['pmid']:
            # Strip any leading apostrophes from the PMID before appending it.
            pmids += row['pmid'].lstrip('\'') + ','
    pmids = pmids[:-1]  # drop the trailing comma
    out = start + pmids + end + '\n'
    ncbiFile.write(out)
buildQuery(master_spreadsheet, ncbiFile)
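
parse_pmids.py only writes the EFetch URL to ncbiquery.txt; the XML still has to be downloaded before import.py can parse it. Below is a minimal sketch of that step (not one of the original scripts): it assumes the query file holds a single URL on its first line and saves the response as NCBIData.xml, the default input of import.py.

#!/usr/bin/python
# fetch_ncbi.py -- hypothetical helper, not part of the original pipeline
import urllib2

query_file = "/home/mpw6/new_papers/ncbiquery.txt"  # path written by parse_pmids.py

# Read the EFetch URL built by buildQuery() (assumed to be the first line).
with open(query_file) as f:
    url = f.readline().strip()

# Download the PubMed XML and save it where import.py looks for it by default.
response = urllib2.urlopen(url)
with open("NCBIData.xml", "w") as out:
    out.write(response.read())
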
PubmedHandler.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import date
from xml.sax import ContentHandler
import xml.sax.saxutils
class PubmedHandler(ContentHandler):
    def __init__(self, output_file='export.tab'):
        self.file = open(output_file, 'w')
        self.sep = '\t'
        self.data = {}
        self.key = ''
        self.pmid = False
        self.authors = []
        self.handling = False
        self.elements = ['Initials',       # multiple, name unique
                         'LastName',       # multiple, name unique
                         'MedlineTA',      # unique
                         'PMID',           # unique
                         'OtherID',        # unique
                         'MedlinePgn',     # unique
                         'Volume',         # unique
                         'Issue',          # unique
                         'Year',           # inside PubDate
                         'Month',          # inside PubDate
                         'Day',            # inside PubDate
                         'ArticleTitle',   # unique
                         'CollectiveName', # unique
                         'ArticleId',
                         'PubDate',
                         ]
        self.file.write(self.get_header())

    def __del__(self):
        self.file.close()

    def startElement(self, name, attrs):
        if name == 'PubmedArticle':
            # Reset per-article state.
            self.data = {'PMID': ''}
            self.authors = []
            self.is_articledate = False
            self.is_pubdate = False
            self.pmid = 0
            self.handling = False
        elif name == 'PubDate':
            self.is_pubdate = True
            self.data['Year'] = ''
            self.data['Month'] = ''
            self.data['Day'] = ''
        elif name in self.elements:
            if name == 'ArticleId':
                # Only track the PubMed Central id among the ArticleId entries.
                if attrs.getValue('IdType') == 'pmc':
                    self.key = 'PMCID'
                    self.data['PMCID'] = ''
                    self.handling = True
            else:
                self.key = name
                self.handling = True
                if name == 'PMID':
                    self.pmid += 1
                if name not in ['Year', 'Month', 'Day', 'PMID']:
                    self.data[name] = ''

    def endElement(self, name):
        if name == 'PubmedArticle':
            self.write_to_file()
        elif name == 'PubDate':
            self.is_pubdate = False
        elif name == 'Author':  # merge author name parts
            if self.get_data('Initials'):
                self.authors.append(self.get_data('Initials') + ' ' + self.get_data('LastName'))
            if self.get_data('CollectiveName'):
                self.authors.append(self.get_data('CollectiveName'))
                self.data['CollectiveName'] = ''
        self.handling = False

    def characters(self, content):
        if self.handling:
            if self.key in ['Year', 'Month', 'Day']:
                if self.is_pubdate:
                    self.data[self.key] += content
            elif self.key == 'PMID':
                if self.pmid == 1:
                    self.data[self.key] += content
            else:
                self.data[self.key] += xml.sax.saxutils.escape(content)

    def write_to_file(self):
        try:
            self.file.write(self.get_row().encode('utf-8'))
        except UnicodeEncodeError as anomerr:
            print anomerr
            print self.get_row()
        return []

    def get_row(self):
        return self.sep.join([', '.join(self.authors),
                              self.get_data('MedlineTA'),
                              self.get_data('PMID'),
                              self.get_data('MedlinePgn'),
                              self.get_data('Volume'),
                              self.get_data('Year'),
                              self.get_citation(),
                              self.get_data('ArticleTitle'),
                              self.get_data('PMCID')]) + '\n'

    def get_citation(self):
        citation = ''
        if self.get_data('Year'):
            citation += '(' + self.get_data('Year') + ').' + ' '
        citation += self.get_data('MedlineTA') + ' '
        date_str = ''
        ref_str = ''
        # build date string
        if self.get_data('Year'):
            date_str += self.get_data('Year')
        if self.get_data('Month'):
            date_str += ' ' + self.get_data('Month')
        if self.get_int_data('Day'):
            date_str += ' %d' % self.get_int_data('Day')
        date_str = date_str.strip()
        # build ref string
        if self.get_data('Volume'):
            ref_str += self.get_data('Volume')
        #if self.get_data('Issue'):
        #    ref_str += '(' + self.get_data('Issue') + ')'
        if self.get_data('MedlinePgn'):
            ref_str += ':' + self.get_data('MedlinePgn')
        ref_str = ref_str.strip()
        #if date_str:
        #    citation += date_str
        if ref_str:
            #if date_str:
            #    citation += ';'
            citation += ref_str
        if citation and citation[-1] != '.':
            citation += '.'
        return citation

    def get_data(self, key):
        if key in self.data:
            return self.data[key]
        return ''

    def get_int_data(self, key):
        val = self.get_data(key)
        try:
            val = int(val)
            return val
        except ValueError:
            pass
        return 0

    def get_header(self):
        return self.sep.join(['Authors', 'Journal', 'PMID', 'Pages', 'Volume', 'Year', 'Citation', 'Title', 'PMCID']) + '\n'
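
PubmedHandler is driven by import.py below, but it can also be exercised on its own. The snippet that follows is only a quick sanity check, not one of the original scripts; the XML fragment is hand-made with placeholder values and is far sparser than real EFetch output.

#!/usr/bin/python
# test_handler.py -- hypothetical sanity check, not part of the original pipeline
from xml import sax
from PubmedHandler import PubmedHandler

# Minimal, made-up record: just enough elements for the handler to emit one row.
sample = """<PubmedArticleSet>
  <PubmedArticle>
    <MedlineCitation>
      <PMID>12345678</PMID>
      <MedlineJournalInfo><MedlineTA>Test J</MedlineTA></MedlineJournalInfo>
      <Article>
        <Journal><JournalIssue>
          <Volume>1</Volume>
          <PubDate><Year>2011</Year><Month>Jan</Month></PubDate>
        </JournalIssue></Journal>
        <ArticleTitle>A made-up title</ArticleTitle>
        <AuthorList>
          <Author><LastName>Doe</LastName><Initials>J</Initials></Author>
        </AuthorList>
      </Article>
    </MedlineCitation>
  </PubmedArticle>
</PubmedArticleSet>"""

handler = PubmedHandler('test_export.tab')
sax.parseString(sample, handler)
del handler  # triggers __del__, which closes test_export.tab
print open('test_export.tab').read()
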
import.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from PubmedHandler import PubmedHandler
from xml import sax
import sys
import os

def main(input_file, output_file):
    parser = sax.make_parser()
    handler = PubmedHandler(output_file)
    parser.setContentHandler(handler)
    parser.parse(input_file)

if __name__ == '__main__':
    input_file = 'NCBIData.xml'
    output_file = 'export.tab'
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    main(input_file, output_file)
    # sed pass over accented characters, producing export_out.tab for reload.py
    cmd = "sed -e 's/ä/\ä/g' -e 's/ç/\ç/g' -e 's/ü/\ü/g' -e 's/ó/\ó/g' -e 's/ö/\ö/g' -e 's/ş/\ş/g' -e 's/í/\í/g' -e 's/é/\é/g' -e 's/ô/\ô/g' -e 's/è/\è/g' export.tab > export_out.tab"
    os.system(cmd)
NewGoogleSpreadSheet.py
#!/usr/bin/python
import gdata.spreadsheet.service
class GoogleSpreadsheetException(Exception):
    pass

class SpreadsheetRow:
    """Row in a GoogleSpreadsheet."""
    def __init__(self, index, row_dict):
        """Inits SpreadsheetRow.

        Args:
            index: Row position in spreadsheet.
            row_dict: Dict with the data for the row in form
                {'header1': value1, 'header2': value2, 'header3': value3, ...}
        """
        self.index = index
        self.row_dict = row_dict

    def __str__(self):
        return str(self.row_dict)
class GoogleSpreadsheet:
"""An iterable Google Spreadsheet."""
def __init__(self, spreadsheet_id, worksheet_id, user='*************', password='**************'):
"""Inits GoogleSpreadsheet.
Args:
spreadsheet_id: Google-assigned unique id for spreadsheet
worksheet_id: Google-assigned unique id for worksheet within spreadsheet
user: Google account id with access to spreadsheet
password: Password for Google account
"""
self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
self.gd_client.email = user
self.gd_client.password = password
self.gd_client.ProgrammaticLogin()
self.count = 0
self.spreadsheet_id = spreadsheet_id
self.worksheet_id = worksheet_id
self.feed = self.gd_client.GetListFeed(self.spreadsheet_id, self.worksheet_id)
self.rows = self.form_rows(self.feed)

    def form_rows(self, list_feed):
        """Builds row data from the Google Spreadsheet.

        Args:
            list_feed: SpreadsheetsListFeed object for the spreadsheet
        """
        rows = []
        for i, entry in enumerate(list_feed.entry):
            d = {}
            for key in entry.custom:
                d[key] = entry.custom[key].text
            rows.append(SpreadsheetRow(i, d))
        return rows

    def reload(self):
        """Refreshes object with current state of the remote spreadsheet."""
        self.feed = self.gd_client.GetListFeed(self.spreadsheet_id, self.worksheet_id)
        self.rows = self.form_rows(self.feed)

    def update_row(self, row):
        """Replaces a row on the remote spreadsheet and updates current object.

        Args:
            row: SpreadsheetRow object to be updated
        """
        entry = self.gd_client.UpdateRow(self.feed.entry[row.index], row.row_dict)
        if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
            raise GoogleSpreadsheetException("Error updating row")
        self.reload()

    def add_row(self, row):
        """Adds an additional row to the remote spreadsheet and updates current
        object.

        Args:
            row: SpreadsheetRow object to be added
        """
        entry = self.gd_client.InsertRow(row.row_dict, self.spreadsheet_id, self.worksheet_id)
        if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
            raise GoogleSpreadsheetException("Error adding row")
        self.reload()

    def delete_row(self, row):
        """Deletes a row from the remote spreadsheet and updates current object.

        Args:
            row: SpreadsheetRow object to be deleted
        """
        self.gd_client.DeleteRow(self.feed.entry[row.index])
        self.reload()

    def __iter__(self):
        return self

    def next(self):
        if self.count >= len(self.rows):
            self.count = 0
            raise StopIteration
        else:
            self.count += 1
            return self.rows[self.count - 1]

    def __getitem__(self, index):
        return self.rows[index]

    def __len__(self):
        return len(self.rows)
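
A short usage sketch for the class above (not one of the original scripts; the spreadsheet id and the 'pmid'/'status' column names are placeholders):

#!/usr/bin/python
# spreadsheet_example.py -- hypothetical usage sketch for NewGoogleSpreadSheet.py
from NewGoogleSpreadSheet import GoogleSpreadsheet, SpreadsheetRow

gs = GoogleSpreadsheet('your_spreadsheet_id', 'od6')  # placeholder spreadsheet id

# Iterate over rows; each is a SpreadsheetRow with .index and .row_dict.
for row in gs:
    print row.index, row.row_dict

# Update the first row in place: edit its dict, then push it back.
if len(gs) > 0:
    first = gs[0]
    first.row_dict['status'] = 'checked'  # 'status' is a hypothetical column header
    gs.update_row(first)

# Append a new row; keys must match the spreadsheet's (lower-cased) column headers.
gs.add_row(SpreadsheetRow(len(gs), {'pmid': '12345678', 'status': 'new'}))
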
reload.py
#!/usr/bin/python
from NewGoogleSpreadSheet import GoogleSpreadsheet, SpreadsheetRow
spreadsheet_id = '0AiiHlTECOi8edHFxQWRwN0kxeTQ3ZzdCOXhQX2Z1ZGc'
worksheet_id = 'od6'
gs = GoogleSpreadsheet(spreadsheet_id, worksheet_id)
f = open('export_out.tab', 'r')
headers = [str(x).lower() for x in f.readline().rstrip('\n').split('\t')]
total_lines = 0
for line_num, line in enumerate(f):
    total_lines += 1
    fields = line.rstrip('\n').split('\t')
    row_dict = {}
    for i, val in enumerate(fields):
        row_dict[headers[i]] = val
    r = SpreadsheetRow(line_num, row_dict)
    print r.index, r.row_dict
    if line_num < len(gs):
        gs.update_row(r)
    else:
        gs.add_row(r)
# Remove excess rows left over from a previous, longer upload.
if total_lines < len(gs):
    # Walk the surplus rows in reverse so earlier deletions do not shift the
    # indices of the rows still to be deleted.
    for r in gs[:total_lines-1:-1]:
        gs.delete_row(r)