#! /usr/bin/env python # importdata2patents.py # # Author: Pedro Hernandez. May, 2008. v1.0 # import dbi, odbc, httplib, mimetypes, sys # This function provides the last id of record of the patents table def get_last_p_id(): sql_st = "SELECT MAX(p_id) FROM patents" cursor.execute(sql_st) last_p_id = cursor.fetchall() try: next_p_id = int(last_p_id[0][0]) + 1 except: next_p_id = 1; return next_p_id # This function provides the last id of record of the patent_query table def get_last_rec_id(): sql_st = "SELECT MAX(recid) FROM patent_query" cursor.execute(sql_st) next_rec_id = cursor.fetchall() try: next_rec_id = int(next_rec_id[0][0]) + 1 except: next_rec_id = 1; return next_rec_id # We check that the string has no simple quotation marks and subst them def check_quot(s_chain): if s_chain.count("'") > 0: for idx in range (0,len(s_chain)): if s_chain[idx] == "'": s_chain = s_chain[0:idx] + "*" + s_chain[idx+1:len(s_chain)] return s_chain # This is the delimiter that will be used in the final file. You can use # either a semicolon ";" or something like that. delimiter = "|" # Get the filename to import the data from if sys.argv[1:] == []: print "No file supplied to import in the database" exit(1) filenamein = sys.argv[1] # Data for the databasse connection DSN = "PostgreSQL30" dbuser = "postgres" dbpass = "123456" # Open the connection conn = odbc.odbc(DSN+"/"+dbuser+"/"+dbpass) #Get the cursor cursor = conn.cursor() # Execute a first statement to get the last patent_id next_p_id = get_last_p_id() # Now we open the delimiter separated file in read mode fin = open(filenamein, 'r') # Find the columns index from the titles title_line = fin.readline() rowdata = title_line.split(delimiter) query_idx = rowdata.index("Query Params") pat_idx = rowdata.index("Record Id.") code_idx = rowdata.index("Patent Code") pubdate_idx = rowdata.index("Publication Date") desc_idx = rowdata.index("Description") intcl_idx = rowdata.index("International Class") appnum_idx = rowdata.index("Application Number") appname_idx = rowdata.index("Applicant Name") url_idx = rowdata.index("url") abst_idx = rowdata.index("Abstract\n") count_new_pat = 0 count_new_term = 0 # Now the actual data stored for line in fin: sql_st = "START TRANSACTION" cursor.execute(sql_st) #try: rowdata = line.split(delimiter) sql_st = "SELECT p_id FROM patents WHERE p_code='" + rowdata[code_idx] + "'" cursor.execute(sql_st) result = cursor.fetchall() if result == []: # This means that it has not been inserted in the patents table. We have to do it rowdata[desc_idx] = check_quot(rowdata[desc_idx]) rowdata[appname_idx] = check_quot(rowdata[appname_idx]) rowdata[abst_idx] = check_quot(rowdata[abst_idx]) rowdata[url_idx] = check_quot(rowdata[url_idx]) sql_st = "INSERT INTO patents (p_id, p_code, pub_dated, description, applicant, intl_class, appl_number, url, abstract) VALUES (" sql_st = sql_st + "'" + str(next_p_id)+"','"+rowdata[code_idx]+"',DATE('"+rowdata[pubdate_idx]+"'),'"+rowdata[desc_idx]+"','" sql_st = sql_st + rowdata[appname_idx]+"','"+rowdata[intcl_idx]+"','"+rowdata[appnum_idx]+"','"+rowdata[url_idx]+"','"+rowdata[abst_idx][0:(len(rowdata[abst_idx]) - 2)]+"');" cursor.execute(sql_st) count_new_pat = count_new_pat + 1 result = [[next_p_id,],] next_p_id = next_p_id + 1 # check ifthe query terms doesn't exist sql_st = "SELECT recid FROM patent_query WHERE pat_id='" + str(result[0][0]) + "' AND query_terms = '" + rowdata[query_idx] + "'" cursor.execute(sql_st) result_s = cursor.fetchall() if result_s == []: # This means that the record does not exist next_rec_id = get_last_rec_id() sql_st = "INSERT INTO patent_query (recid, pat_id, query_terms) VALUES ('" + str(next_rec_id) + "','" + str(result[0][0]) + "','" + rowdata[query_idx] + "');" cursor.execute(sql_st) count_new_term = count_new_term + 1 next_rec_id = next_rec_id + 1 sql_st = "COMMIT TRANSACTION" cursor.execute(sql_st) #except: # If an exception is produced in the process of all the inserts... #sql_st = "ROLLBACK TRANSACTION" #cursor.execute(sql_st) #print "Error in line :" + rowdata[pat_idx] #break fin.close() print str(count_new_term) + " new search term records added for " + str(count_new_pat) + " new patents found"