PubmedSpreadsheet Generation Code

From GersteinInfo

(Difference between revisions)
Jump to: navigation, search
(reload.py)
 
(19 intermediate revisions not shown)
Line 1: Line 1:
-
parse_pmids.py:
+
==parse_pmids.py==
-
+
  <code>
  <code>
  #!/usr/bin/python
  #!/usr/bin/python
Line 6: Line 5:
  from GoogleSpreadsheet import GoogleSpreadsheet
  from GoogleSpreadsheet import GoogleSpreadsheet
  from datetime import datetime<br \>
  from datetime import datetime<br \>
-
  master_spreadsheet_id = "thsIyYg12E8Px0zGJQsAopg"
+
  master_spreadsheet_id = "******************"
  worksheet_id = "od6"
  worksheet_id = "od6"
  master_spreadsheet = GoogleSpreadsheet(master_spreadsheet_id, worksheet_id)<br \>
  master_spreadsheet = GoogleSpreadsheet(master_spreadsheet_id, worksheet_id)<br \>
Line 23: Line 22:
  buildQuery(master_spreadsheet, ncbiFile)
  buildQuery(master_spreadsheet, ncbiFile)
  </code>
  </code>
 +
 +
==PubmedHandler.py==
 +
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""SAX handler that flattens PubMed article XML into a tab-separated file.

One output row is written per <PubmedArticle> element; columns are
Authors, Journal, PMID, Pages, Volume, Year, Citation, Title, PMCID.
"""
from xml.sax import ContentHandler
import xml.sax.saxutils


class PubmedHandler(ContentHandler):
    """Streams PubMed XML and writes one tab-separated line per article."""

    def __init__(self, output_file='export.tab'):
        # open() with explicit UTF-8 replaces the Py2-only file() builtin and
        # makes the per-row .encode('utf-8') of the original unnecessary.
        self.file = open(output_file, 'w', encoding='utf-8')
        self.sep = '\t'
        self.data = {}          # field name -> accumulated text for current article
        self.key = ''           # name of the element currently being captured
        self.pmid = 0           # count of <PMID> tags seen (only the first is kept)
        self.authors = []       # formatted author names for the current article
        self.handling = False   # True while inside an element we capture text from
        self.is_pubdate = False # True while inside <PubDate>
        # Elements whose character data we capture.
        self.elements = ['Initials',       # multiple, name unique
                         'LastName',       # multiple, name unique
                         'MedlineTA',      # unique
                         'PMID',           # unique
                         'OtherID',        # unique
                         'MedlinePgn',     # unique
                         'Volume',         # unique
                         'Issue',          # unique
                         'Year',           # inside PubDate
                         'Month',          # inside PubDate
                         'Day',            # inside PubDate
                         'ArticleTitle',   # unique
                         'CollectiveName', # unique
                         'ArticleId',
                         'PubDate',
                         ]
        self.file.write(self.get_header())

    def __del__(self):
        # Guard: __init__ may have failed before self.file existed.
        f = getattr(self, 'file', None)
        if f is not None and not f.closed:
            f.close()

    def startElement(self, name, attrs):
        if name == 'PubmedArticle':
            # Reset all per-article state.
            self.data = {'PMID': ''}
            self.authors = []
            self.is_articledate = False
            self.is_pubdate = False
            self.pmid = 0
            self.handling = False
        elif name == 'PubDate':
            self.is_pubdate = True
            self.data['Year'] = ''
            self.data['Month'] = ''
            self.data['Day'] = ''
        elif name in self.elements:
            if name == 'ArticleId':
                # Only the PubMed Central id is kept; attrs.get avoids the
                # KeyError that attrs.getValue raised when IdType was absent.
                if attrs.get('IdType') == 'pmc':
                    self.key = 'PMCID'
                    self.data['PMCID'] = ''
                    self.handling = True
            else:
                self.key = name
                self.handling = True
            if name == 'PMID':
                self.pmid += 1
            if name not in ['Year', 'Month', 'Day', 'PMID']:
                self.data[name] = ''

    def endElement(self, name):
        if name == 'PubmedArticle':
            self.write_to_file()
        elif name == 'PubDate':
            self.is_pubdate = False
        elif name == 'Author':
            # Merge the collected name parts into one display name.
            if self.get_data('Initials'):
                self.authors.append(self.get_data('Initials') + ' ' +
                                    self.get_data('LastName'))
            if self.get_data('CollectiveName'):
                self.authors.append(self.get_data('CollectiveName'))
            self.data['CollectiveName'] = ''
        self.handling = False

    def characters(self, content):
        if not self.handling:
            return
        if self.key in ['Year', 'Month', 'Day']:
            # Only dates inside <PubDate> count.
            if self.is_pubdate:
                self.data[self.key] += content
        elif self.key == 'PMID':
            # Keep only the first <PMID> seen in the article.
            if self.pmid == 1:
                self.data[self.key] += content
        else:
            # Escape &, <, > so the value is safe in the HTML-ish citation.
            self.data[self.key] += xml.sax.saxutils.escape(content)

    def write_to_file(self):
        """Emit the current article as one output line."""
        try:
            self.file.write(self.get_row())
        except UnicodeEncodeError as anomerr:
            # Best-effort: report the bad row and keep parsing.
            print(anomerr)
            print(self.get_row())
            return []

    def get_row(self):
        """Return one tab-separated output line for the current article."""
        return self.sep.join([', '.join(self.authors),
                              self.get_data('MedlineTA'),
                              self.get_data('PMID'),
                              self.get_data('MedlinePgn'),
                              self.get_data('Volume'),
                              self.get_data('Year'),
                              self.get_citation(),
                              self.get_data('ArticleTitle'),
                              self.get_data('PMCID')]) + '\n'

    def get_citation(self):
        """Build the citation string: '(Year). <i>Journal</i>&nbsp;Volume:Pages.'"""
        citation = ''
        if self.get_data('Year'):
            citation += '(' + self.get_data('Year') + ').' + ' '
            citation += '<i>' + self.get_data('MedlineTA') + '</i>' + '&nbsp;'
        date_str = ''
        ref_str = ''
        # Build the date string (not currently part of the output; kept as in
        # the original in case it is re-enabled).
        if self.get_data('Year'):
            date_str += self.get_data('Year')
        if self.get_data('Month'):
            date_str += ' ' + self.get_data('Month')
        if self.get_int_data('Day'):
            date_str += ' %d' % self.get_int_data('Day')
        date_str = date_str.strip()

        # Build the reference string: Volume:Pages.
        if self.get_data('Volume'):
            ref_str += self.get_data('Volume')
        if self.get_data('MedlinePgn'):
            ref_str += ':' + self.get_data('MedlinePgn')
        ref_str = ref_str.strip()

        if ref_str:
            citation += ref_str
        # Guard the empty case: the original indexed citation[-1]
        # unconditionally, raising IndexError when year/volume/pages
        # were all absent.
        if citation and citation[-1] != '.':
            citation += '.'
        return citation

    def get_data(self, key):
        """Return the accumulated text for *key*, or '' if never seen."""
        # dict.get replaces the Py2-only has_key().
        return self.data.get(key, '')

    def get_int_data(self, key):
        """Return *key*'s value as an int, or 0 if missing/non-numeric."""
        val = self.get_data(key)
        try:
            return int(val)
        except ValueError:
            pass
        return 0

    def get_header(self):
        """Return the tab-separated header line."""
        return self.sep.join(['Authors', 'Journal', 'PMID', 'Pages', 'Volume',
                              'Year', 'Citation', 'Title', 'PMCID']) + '\n'
 +
 +
==import.py==
 +
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Parse a downloaded PubMed XML dump into a tab-separated file.

Usage: import.py [input_xml] [output_tab]
Defaults: NCBIData.xml -> export.tab. After parsing, a sed pass rewrites
a fixed set of accented characters to HTML entities in export_out.tab.
"""
from PubmedHandler import PubmedHandler
from xml import sax
import sys
import os


def main(input_file, output_file):
    """Stream-parse *input_file* with PubmedHandler, writing rows to *output_file*."""
    parser = sax.make_parser()
    handler = PubmedHandler(output_file)
    parser.setContentHandler(handler)
    parser.parse(input_file)


# Original had this guard mis-indented (IndentationError at module level).
if __name__ == '__main__':
    input_file = 'NCBIData.xml'
    output_file = 'export.tab'
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    main(input_file, output_file)
    # Post-process: replace accented characters with HTML entities.
    # NOTE(review): sed always reads export.tab and writes export_out.tab,
    # ignoring a custom output_file argument -- presumably intentional; confirm.
    cmd = ("sed -e 's/ä/\\&#228;/g' -e 's/ç/\\&#231;/g' -e 's/ü/\\&#252;/g' "
           "-e 's/ó/\\&#243;/g' -e 's/ö/\\&#246;/g' -e 's/ş/\\&#351;/g' "
           "-e 's/í/\\&#237;/g' -e 's/é/\\&#233;/g' -e 's/ô/\\&#244;/g' "
           "-e 's/è/\\&#232;/g' export.tab > export_out.tab")
    os.system(cmd)
 +
 +
==NewGoogleSpreadSheet.py==
 +
<code><pre>
 +
#!/usr/bin/python
 +
import gdata.spreadsheet.service
 +
class GoogleSpreadsheetException(Exception):
 +
    pass
 +
class SpreadsheetRow:
 +
    """Row in a GoogleSpreadsheet"""
 +
        def __init__(self, index, row_dict):
 +
"""Inits SpreadsheetRow.
 +
 +
Args:
 +
index:  Row position in spreadsheet.
 +
row_dict:  Dict with the data for the row in form
 +
{'header1' : value1, 'header2' : value2, 'header3' : value3, ...}
 +
"""
 +
self.index = index
 +
self.row_dict = row_dict
 +
 +
def __str__(self):
 +
return str(self.row_dict)
 +
 +
class GoogleSpreadsheet:
 +
"""An iterable Google Spreadsheet."""
 +
 +
def __init__(self, spreadsheet_id, worksheet_id, user='*************', password='**************'):
 +
"""Inits GoogleSpreadsheet.
 +
 +
Args:
 +
spreadsheet_id: Google-assigned unique id for spreadsheet
 +
worksheet_id: Google-assigned unique id for worksheet within spreadsheet
 +
user: Google account id with access to spreadsheet
 +
password: Password for Google account
 +
"""
 +
self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
 +
self.gd_client.email = user
 +
self.gd_client.password = password
 +
self.gd_client.ProgrammaticLogin()
 +
 +
self.count = 0
 +
self.spreadsheet_id = spreadsheet_id
 +
self.worksheet_id = worksheet_id
 +
self.feed = self.gd_client.GetListFeed(self.spreadsheet_id, self.worksheet_id)
 +
self.rows = self.form_rows(self.feed)
 +
 +
def form_rows(self, ListFeed):
 +
"""Builds row data from the Google Spreadsheet.
 +
 +
Args:
 +
listFeed: SpreadsheetsListFeed object for the spreadsheet
 +
"""
 +
rows = []
 +
for i, entry in enumerate(ListFeed.entry):
 +
d = {}
 +
for key in entry.custom:
 +
d[key] = entry.custom[key].text
 +
rows.append(SpreadsheetRow(i, d))
 +
return rows
 +
 +
def reload(self):
 +
"""Refreshes object with current state of the remote spreadsheet"""
 +
self.feed = self.gd_client.GetListFeed(self.spreadsheet_id, self.worksheet_id)
 +
self.rows = self.form_rows(self.feed)
 +
 +
def update_row(self, row):
 +
"""Replaces a row on the remote spreadsheet and updates current object.
 +
 +
Args:
 +
row: SpreadsheetRow object to be updated
 +
"""
 +
entry = self.gd_client.UpdateRow(self.feed.entry[row.index], row.row_dict)
 +
if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
 +
raise GoogleSpreadsheetException("Error updating row")
 +
self.reload()
 +
 +
def add_row(self, row):
 +
"""Adds an additional row to the remote spreadsheet and updates current
 +
object.
 +
 +
Args:
 +
row: SpreadsheetRow object to be added
 +
"""
 +
entry = self.gd_client.InsertRow(row.row_dict, self.spreadsheet_id, self.worksheet_id)
 +
if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
 +
raise GoogleSpreadsheetException("Error updating row")
 +
self.reload()
 +
 +
def delete_row(self, row):
 +
"""Deletes a row from the remote spreadsheet and updates current object.
 +
 +
Args:
 +
row: SpreadsheetRow object to be deleted
 +
"""
 +
entry = self.gd_client.DeleteRow(self.feed.entry[row.index])
 +
self.reload()
 +
 +
def __iter__(self):
 +
return self
 +
 +
def next(self):
 +
if self.count >= len(self.rows):
 +
self.count = 0
 +
raise StopIteration
 +
else:
 +
self.count += 1
 +
return self.rows[self.count - 1]
 +
 +
def __getitem__(self, index):
 +
return self.rows[index]
 +
 +
def __len__(self):
 +
return len(self.rows)
 +
</pre></code>
 +
==reload.py==
 +
<code><pre>
 +
#!/usr/bin/python
 +
from NEWGoogleSpreadsheet import GoogleSpreadsheet, SpreadsheetRow
 +
 +
spreadsheet_id = '0AiiHlTECOi8edHFxQWRwN0kxeTQ3ZzdCOXhQX2Z1ZGc'
 +
worksheet_id = 'od6'
 +
 +
gs = GoogleSpreadsheet(spreadsheet_id, worksheet_id)
 +
f = open('export_out.tab', 'r')
 +
 +
headers = [str(x).lower() for x in f.readline().rstrip('\n').split('\t')]
 +
total_lines = 0
 +
for line_num, line in enumerate(f):
 +
        total_lines += 1
 +
        fields = line.rstrip('\n').split('\t')
 +
        row_dict = {}
 +
        for i, val in enumerate(fields):
 +
                row_dict[headers[i]] = val
 +
        r = SpreadsheetRow(line_num, row_dict)
 +
        print r.index, r.row_dict
 +
        if line_num < len(gs):
 +
                gs.update_row(r)
 +
        else:
 +
                gs.add_row(r)
 +
# Remove excess
 +
if total_lines < len(gs):
 +
        for r in gs[:total_lines-1:-1]:
 +
                gs.delete_row(r)
 +
</pre></code>

Latest revision as of 10:53, 19 September 2011

Contents

parse_pmids.py


#!/usr/bin/python
import os, sys
from GoogleSpreadsheet import GoogleSpreadsheet
from datetime import datetime
master_spreadsheet_id = "******************" worksheet_id = "od6" master_spreadsheet = GoogleSpreadsheet(master_spreadsheet_id, worksheet_id)
ncbiquery = "/home/mpw6/new_papers/ncbiquery.txt" ncbiFile = open(ncbiquery,'w')
def buildQuery(master_spreadsheet, ncbiFile):
    """Write one NCBI efetch URL covering every PMID in the spreadsheet.

    Args:
        master_spreadsheet: iterable of row mappings; rows with a truthy
            'pmid' entry contribute their id to the comma-separated query.
        ncbiFile: writable text file that receives the URL plus a newline.
    """
    start = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id="
    end = "&rettype=xml&retmode=file"
    pmids = ''
    for row in master_spreadsheet:
        if row['pmid']:
            # Spreadsheet cells may carry a leading apostrophe (text marker);
            # strip it before joining ids with commas.
            # NOTE(review): reconstructed from a wiki-mangled lstrip('\) --
            # confirm the stripped character is the apostrophe.
            pmids += row['pmid'].lstrip('\'') + ','
    pmids = pmids[:-1]  # drop trailing comma (empty string stays empty)
    out = start + pmids + end + '\n'
    ncbiFile.write(out)
buildQuery(master_spreadsheet, ncbiFile)

PubmedHandler.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import date
from xml.sax import ContentHandler
import xml.sax.saxutils
class PubmedHandler(ContentHandler): def __init__(self,output_file = 'export.tab'): self.file = file(output_file,'w') self.sep = '\t' self.data = {} self.key = self.pmid = False self.authors = [] self.handling = False self.elements = ['Initials', #multiple, name unique 'LastName', #multiple, name unique 'MedlineTA', #unique 'PMID', #unique 'OtherID', #unique 'MedlinePgn', #unique 'Volume', #unique 'Issue', #unique 'Year', #inside PubDate 'Month', #inside PubDate 'Day', #inside PubDate 'ArticleTitle', #unique 'CollectiveName',#unique 'ArticleId', 'PubDate', ] self.file.write(self.get_header())
def __del__(self): self.file.close()
def startElement(self,name,attrs): if name == 'PubmedArticle': self.data = {'PMID':} self.authors = [] self.is_articledate = False self.is_pubdate = False self.pmid = 0 self.handling = False elif name == 'PubDate': self.is_pubdate = True self.data['Year'] = self.data['Month']= self.data['Day'] = elif name in self.elements: if name == 'ArticleId': if attrs.getValue('IdType') == 'pmc': self.key = 'PMCID' self.data['PMCID'] = self.handling = True else: self.key = name self.handling = True if name == 'PMID': self.pmid += 1 if name not in ['Year','Month','Day','PMID']: self.data[name] =
def endElement(self,name): if name == 'PubmedArticle': self.write_to_file() elif name == 'PubDate': self.is_pubdate = False elif name == 'Author': #merge author if self.get_data('Initials'): self.authors.append(self.get_data('Initials') + ' ' + self.get_data('LastName')) if self.get_data('CollectiveName'): self.authors.append(self.get_data('CollectiveName')) self.data['CollectiveName'] = self.handling = False
def characters(self,content): if self.handling: if self.key in ['Year','Month','Day']: if self.is_pubdate: self.data[self.key] += content elif self.key == 'PMID': if self.pmid == 1: self.data[self.key] += content else: self.data[self.key] += xml.sax.saxutils.escape(content)
def write_to_file(self): try: self.file.write(self.get_row().encode('utf-8')) except UnicodeEncodeError as anomerr: print anomerr print self.get_row() return[]
def get_row(self): return self.sep.join([', '.join(self.authors), self.get_data('MedlineTA'), self.get_data('PMID'), self.get_data('MedlinePgn'), self.get_data('Volume'), self.get_data('Year'), self.get_citation(), self.get_data('ArticleTitle'), self.get_data('PMCID')]) + '\n'
def get_citation(self): citation = if self.get_data('Year'): citation += '(' + self.get_data('Year') + ').' + ' ' citation += '' + self.get_data('MedlineTA') + '' + ' ' date_str = ref_str = #build date string if self.get_data('Year'): date_str += self.get_data('Year') if self.get_data('Month'): date_str += ' ' + self.get_data('Month') if self.get_int_data('Day'): date_str += ' %d' % self.get_int_data('Day') date_str = date_str.strip() #build ref string if self.get_data('Volume'): ref_str += self.get_data('Volume') #if self.get_data('Issue'): # ref_str += '('+self.get_data('Issue')+')' if self.get_data('MedlinePgn'): ref_str += ':' + self.get_data('MedlinePgn') ref_str = ref_str.strip() #if date_str: #citation += date_str if ref_str: #if date_str: #citation += ';' citation += ref_str if citation[-1] != '.': citation += '.' return citation def get_data(self,key): if self.data.has_key(key): return self.data[key] return def get_int_data(self,key): val = self.get_data(key) try: val = int(val) return val except ValueError: pass return 0 def get_header(self): return self.sep.join(['Authors','Journal','PMID','Pages','Volume','Year','Citation','Title','PMCID']) + '\n'

import.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
from PubmedHandler import PubmedHandler
from xml import sax
import sys
import os
def main(input_file,output_file) :
   parser = sax.make_parser()
   handler = PubmedHandler(output_file)
   parser.setContentHandler(handler)
   parser.parse(input_file)
 
 if __name__ == '__main__':
   input_file = 'NCBIData.xml'
   output_file = 'export.tab'
   if len(sys.argv) > 1:
       input_file = sys.argv[1]
   if len(sys.argv) > 2:
       output_file = sys.argv[2]
   main(input_file,output_file)
   cmd = "sed -e 's/ä/\ä/g' -e 's/ç/\ç/g' -e 's/ü/\ü/g' -e 's/ó/\ó/g' -e 's/ö/\ö/g' -e 's/ş/\ş/g' -e 's/í/\í/g' -e 's/é/\é/g' -e 's/ô/\ô/g' -e 's/è/\è/g' export.tab > export_out.tab"
   os.system(cmd)

NewGoogleSpreadSheet.py

 
#!/usr/bin/python
 import gdata.spreadsheet.service
 class GoogleSpreadsheetException(Exception):
     pass
 class SpreadsheetRow:
     """Row in a GoogleSpreadsheet"""
        def __init__(self, index, row_dict):
		"""Inits SpreadsheetRow.
	
		Args:
			index:  Row position in spreadsheet.
			row_dict:  Dict with the data for the row in form 
			{'header1' : value1, 'header2' : value2, 'header3' : value3, ...}
		"""
		self.index = index
		self.row_dict = row_dict
		
	def __str__(self):
		return str(self.row_dict)
		
 class GoogleSpreadsheet:
	"""An iterable Google Spreadsheet."""

	def __init__(self, spreadsheet_id, worksheet_id, user='*************', password='**************'):
		"""Inits GoogleSpreadsheet.
	
		Args:
			spreadsheet_id: Google-assigned unique id for spreadsheet
			worksheet_id: Google-assigned unique id for worksheet within spreadsheet
			user: Google account id with access to spreadsheet
			password: Password for Google account
		"""
		self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
		self.gd_client.email = user
		self.gd_client.password = password
		self.gd_client.ProgrammaticLogin()
	
		self.count = 0
		self.spreadsheet_id = spreadsheet_id
		self.worksheet_id = worksheet_id
		self.feed = self.gd_client.GetListFeed(self.spreadsheet_id, self.worksheet_id)
		self.rows = self.form_rows(self.feed)
	
	def form_rows(self, ListFeed):
		"""Builds row data from the Google Spreadsheet.
	
		Args:
			listFeed: SpreadsheetsListFeed object for the spreadsheet
		"""
		rows = []
		for i, entry in enumerate(ListFeed.entry):
			d = {}
			for key in entry.custom:
				d[key] = entry.custom[key].text
			rows.append(SpreadsheetRow(i, d))
		return rows

	def reload(self):
		"""Refreshes object with current state of the remote spreadsheet"""
		self.feed = self.gd_client.GetListFeed(self.spreadsheet_id, self.worksheet_id)
		self.rows = self.form_rows(self.feed)
	
	def update_row(self, row):
		"""Replaces a row on the remote spreadsheet and updates current object.
	
		Args:
			row: SpreadsheetRow object to be updated
		"""
		entry = self.gd_client.UpdateRow(self.feed.entry[row.index], row.row_dict)
		if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
			raise GoogleSpreadsheetException("Error updating row")
		self.reload()

	def add_row(self, row):
		"""Adds an additional row to the remote spreadsheet and updates current
		object.

		Args:
			row: SpreadsheetRow object to be added
		"""
		entry = self.gd_client.InsertRow(row.row_dict, self.spreadsheet_id, self.worksheet_id)
		if not isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
			raise GoogleSpreadsheetException("Error updating row")
		self.reload()

	def delete_row(self, row):
		"""Deletes a row from the remote spreadsheet and updates current object.

		Args:
			row: SpreadsheetRow object to be deleted
		"""
		entry = self.gd_client.DeleteRow(self.feed.entry[row.index])
		self.reload()
							
	def __iter__(self):
		return self

	def next(self):
		if self.count >= len(self.rows):
			self.count = 0
			raise StopIteration
		else:
			self.count += 1
			return self.rows[self.count - 1]

	def __getitem__(self, index):
		return self.rows[index]
	
	def __len__(self):
		return len(self.rows)

reload.py

#!/usr/bin/python
"""Push export_out.tab into the Google Spreadsheet, row by row.

Updates existing remote rows in place, appends new ones past the end,
then deletes any leftover remote rows beyond the imported data.
"""
# NOTE(review): the class file on this page is titled NewGoogleSpreadSheet.py
# but is imported here as NEWGoogleSpreadsheet -- one of the two names must
# be wrong; confirm the actual module filename on disk.
from NEWGoogleSpreadsheet import GoogleSpreadsheet, SpreadsheetRow

spreadsheet_id = '0AiiHlTECOi8edHFxQWRwN0kxeTQ3ZzdCOXhQX2Z1ZGc'
worksheet_id = 'od6'

gs = GoogleSpreadsheet(spreadsheet_id, worksheet_id)

total_lines = 0
# with-block closes the input file (the original leaked the handle).
with open('export_out.tab', 'r') as f:
    # First line holds the column headers; lower-case them to match the
    # gdata list-feed keys.
    headers = [str(x).lower() for x in f.readline().rstrip('\n').split('\t')]
    for line_num, line in enumerate(f):
        total_lines += 1
        fields = line.rstrip('\n').split('\t')
        row_dict = {headers[i]: val for i, val in enumerate(fields)}
        r = SpreadsheetRow(line_num, row_dict)
        print(r.index, r.row_dict)
        if line_num < len(gs):
            gs.update_row(r)   # overwrite the existing remote row
        else:
            gs.add_row(r)      # past the current end: append

# Remove excess remote rows, highest index first so earlier indexes stay
# valid as rows disappear. (The original slice gs[:total_lines-1:-1]
# silently deleted nothing when total_lines == 0.)
if total_lines < len(gs):
    for r in gs[total_lines:][::-1]:
        gs.delete_row(r)
Personal tools