// drug_import.cpp : Defines the entry point for the console application. // #include "stdafx.h" using namespace std; // // progress - Courtesy of my good friend Lee Killough :P // void progress() { static unsigned int pcnt; if(!((++pcnt)&31)) printf("%c\b", "/-\\|"[((pcnt)>>5)&3]); } // // FatalError // // Post a message and exit // void FatalError(const char *str, ...) { va_list va; va_start(va, str); vfprintf(stderr, str, va); exit(1); va_end(va); } // // LoadTextFile // // Load a text file from disk. Returns null if fails. // char *LoadTextFile(const char *filename) { FILE *f; char *buffer; long size; if(!(f = fopen(filename, "rb"))) return NULL; fseek(f, 0, SEEK_END); size = ftell(f); fseek(f, 0, SEEK_SET); if(size <= 0) { fclose(f); return NULL; } buffer = new char [size + 1]; memset(buffer, 0, size+1); if(fread(buffer, 1, size, f) != size) { delete [] buffer; fclose(f); return NULL; } fclose(f); return buffer; } // // GetLineAsString // // Gets the next line of the input buffer as a string. // Returns false if input position is at the end of the input buffer. // bool GetLineAsString(const char **input, string &output) { const char *rover = *input; output.clear(); while(*rover != '\0') { if(*rover == '\n') { ++rover; break; } if(*rover != '\r') // no \r's, thanks. { if(*rover == '\'') output.append("''"); // escape single quotes else output.push_back(*rover); } ++rover; } // Advance the input position *input = rover; return (*rover != '\0'); } // // RStrip // // Remove whitespace from right-hand end of a string // string RStrip(string &str) { size_t last_nonspace = str.find_last_not_of(' '); if(last_nonspace == string::npos) // all spaces? return string(""); else if(last_nonspace != str.length() - 1) // has spaces at end? return str.substr(0, last_nonspace+1); else return str; } // // LStrip // // Remove whitespace from the left-hand end of a string // string LStrip(string &str) { size_t first_nonspace = str.find_first_not_of(' '); if(first_nonspace == string::npos) // all spaces? return string(""); else if(first_nonspace != 0) // has spaces at beginning? return str.substr(first_nonspace); else return str; } // // LStripZeroes // // Removes leading zeroes from a numeric string. // string LStripZeroes(string &str) { size_t first_nonzero = str.find_first_not_of('0'); if(first_nonzero == string::npos) // all zeroes? return string("0"); else if(first_nonzero != 0) // has zeroes at beginning? return str.substr(first_nonzero); else return str; } // // InputLineToVec // // Turns a line of fixed-length space-padded records into a set // void InputLineToVec(const string &input, const vector &recordLens, vector &outputVec) { size_t numRecords = recordLens.size(); // number of records to expect in the string size_t pos = 0; for(size_t i = 0; i < numRecords; i++) { outputVec.push_back(LStrip(RStrip(input.substr(pos, recordLens[i])))); pos += recordLens[i]; } } // // InputFileToVecMap // // Converts an input file into a vector > // Inputs: // filename - name of the file to read in. // recordLens - vector of record lengths on each line // fieldNames - vector of field names // void InputFileToVecMap(const char *filename, const vector &recordLens, const vector &recordNames, vector > &outputVecMap) { // load the file char *inputBuffer = LoadTextFile(filename); const char *inputRover = inputBuffer; string inputLine; int recnum = 0; if(!inputBuffer) FatalError("\nCouldn't load input file %s\n", filename); while(GetLineAsString(&inputRover, inputLine)) { vector fields; InputLineToVec(inputLine, recordLens, fields); // get vector of fields on line if(fields.size() != recordNames.size()) FatalError("\nNumber of fields on line does not match expected number of fields:\n %s\n", inputLine.c_str()); outputVecMap.push_back(map()); // add a new row to the vecmap map &back_map = outputVecMap.back(); // reference to newly-added map // For each record name, set back_map[name] = value for(size_t i = 0; i < recordNames.size(); i++) back_map[recordNames[i]] = fields[i]; progress(); } // done with input delete [] inputBuffer; } // All data maps for FDA drug schema vector > listings; vector > dosageforms; vector > firms; vector > formulations; vector > packages; vector > routes; vector > schedules; // // LoadListings // // Load the listings.TXT file which contains all the FDA-approved drugs. // void LoadListings() { vector recordLens; vector recordNames; printf("LoadListings "); recordLens.push_back(8); // listing_seq_op (id) recordLens.push_back(7); // lblcode (xref to firms) recordLens.push_back(5); // prodcode recordLens.push_back(11); // strength (compressable) recordLens.push_back(11); // unit (compressable) recordLens.push_back(2); // rx_otc recordLens.push_back(100); // name (compressable) recordNames.push_back("id"); recordNames.push_back("firm_id"); recordNames.push_back("prodcode"); recordNames.push_back("strength"); recordNames.push_back("unit"); recordNames.push_back("rx_otc"); recordNames.push_back("name"); InputFileToVecMap("listings.TXT", recordLens, recordNames, listings); printf("\n"); } // // LoadDoseForms // // Load the DOSEFORM.TXT file which contains dosage forms for each drug. (1:N) // Note that the dosage form textual description is included in this record and // should be omitted from the database as it can be xref'd from dosage types. // void LoadDoseForms() { vector recordLens; vector recordNames; printf("LoadDoseForms "); recordLens.push_back(8); // listing_seq_id (xref to listings) recordLens.push_back(4); // doseform (xref to dosage types) recordLens.push_back(240); // dosage_name (redundant with dosage types table!) recordNames.push_back("listing_id"); recordNames.push_back("dosage_types_id"); recordNames.push_back("dosage_name"); InputFileToVecMap("DOSEFORM.TXT", recordLens, recordNames, dosageforms); printf("\n"); } // // Load Firms // // Load the FIRMS.TXT file which contains FDA-registered drug manufacturers. // This includes a lot of address information that we're not interested in. // All we really want here are the lblcode firm id and name. // void LoadFirms() { vector recordLens; vector recordNames; printf("LoadFirms "); recordLens.push_back(7); // lblcode (firm id) recordLens.push_back(66); // firm_name recordLens.push_back(41); // addr_heading (don't want!) recordLens.push_back(41); // street (don't want!) recordLens.push_back(10); // po_box (don't want!) recordLens.push_back(41); // foreign_addr (don't want!) recordLens.push_back(31); // city (don't want!) recordLens.push_back(3); // state (don't want!) recordLens.push_back(10); // zip (don't want!) recordLens.push_back(31); // province (don't want!) recordLens.push_back(40); // country_name (don't want!) recordNames.push_back("id"); recordNames.push_back("firm_name"); recordNames.push_back("addr_heading"); recordNames.push_back("street"); recordNames.push_back("po_box"); recordNames.push_back("foreign_addr"); recordNames.push_back("city"); recordNames.push_back("state"); recordNames.push_back("zip"); recordNames.push_back("province"); recordNames.push_back("country_name"); InputFileToVecMap("FIRMS.TXT", recordLens, recordNames, firms); printf("\n"); } // // LoadFormulations // // Loads the FORMULAT.TXT file which contains active ingredient formulations for each drug (1:N). // The units and ingredient names need to be compressed into external lookups, as they repeat the // same values over and over. // void LoadFormulations() { vector recordLens; vector recordNames; printf("LoadFormulations "); recordLens.push_back(8); // listing_seq_no (xref to listings) recordLens.push_back(11); // strength recordLens.push_back(6); // unit (compress this to unique values!) recordLens.push_back(100); // ingredient_name (compress this to unique values!) recordNames.push_back("listing_id"); recordNames.push_back("strength"); recordNames.push_back("unit"); recordNames.push_back("ingredient_name"); InputFileToVecMap("FORMULAT.TXT", recordLens, recordNames, formulations); printf("\n"); } // // LoadSchedules // // Load the SCHEDULE.TXT file which contains DEA scheduling info for drugs. // void LoadSchedules() { vector recordLens; vector recordNames; printf("LoadSchedules "); recordLens.push_back(8); // listing_seq_no (xref to listings) recordLens.push_back(1); // schedule_no recordNames.push_back("listing_id"); recordNames.push_back("schedule_num"); InputFileToVecMap("SCHEDULE.TXT", recordLens, recordNames, schedules); printf("\n"); } // // LoadPackages // // Load the packages.txt file which contains packaging info for drugs (1:N) // package size and type are compressable into lookups. // void LoadPackages() { vector recordLens; vector recordNames; printf("LoadPackages "); recordLens.push_back(8); // listing_seq_no (xref to listings) recordLens.push_back(3); // pkgcode recordLens.push_back(26); // packsize (compressable) recordLens.push_back(25); // packtype (compressable) recordNames.push_back("listing_id"); recordNames.push_back("pkgcode"); recordNames.push_back("packsize"); recordNames.push_back("packtype"); InputFileToVecMap("packages.txt", recordLens, recordNames, packages); printf("\n"); } // // LoadRoutes // // Load the extremely wasteful ROUTES.TXT file which can be compressed by removing // the last field entirely in favor of the lookup. // void LoadRoutes() { vector recordLens; vector recordNames; printf("LoadRoutes "); recordLens.push_back(8); // listing_seq_no (xref to listings) recordLens.push_back(4); // route_id recordLens.push_back(240); // route_name (get rid of this, compress!) recordNames.push_back("listing_id"); recordNames.push_back("route_id"); recordNames.push_back("route_name"); InputFileToVecMap("ROUTES.TXT", recordLens, recordNames, routes); printf("\n"); } // // WriteInsertStatement // // Returns an SQL insert statement given a map of field names to values, the table name, and an optional id value to // add in as the first column where needed. // void WriteInsertStatement(FILE *output_file, const map &row, const string &table_name, bool needs_id, const string &id_val) { string output_str = string("insert into ") + table_name + " ("; if(needs_id) output_str += "id, "; for(map::const_iterator i_row = row.begin(); i_row != row.end(); ++i_row) { if(i_row != row.begin()) output_str += ", "; output_str += i_row->first; } output_str += ") values ("; if(needs_id) output_str += id_val + ", "; for(map::const_iterator i_row = row.begin(); i_row != row.end(); ++i_row) { if(i_row != row.begin()) output_str += ", "; output_str += "'" + i_row->second + "'"; } output_str += ");\n"; fputs(output_str.c_str(), output_file); } // // BuildUniqueLookup // // Give this a vecmap and a column name. // The output set will contain unique values from that column. // void BuildUniqueLookup(vector > &table, const char *column, map &output) { size_t starting_idx = 1; printf("Building lookup for field %s ", column); for(vector >::iterator i_row = table.begin(); i_row != table.end(); ++i_row) { map &row_map = *i_row; if(output.find(row_map[column]) == output.end()) { progress(); output.insert(pair(row_map[column], starting_idx)); ++starting_idx; } } printf("\n"); } // Lookups to build map listing_strengths; map listing_units; map listing_names; map dosage_types; map formula_units; map formula_names; map package_sizes; map package_types; map route_names; // // WriteLookup // // Write out a lookup table to the output script // void WriteLookup(const map &lookup, const char *table_name, const char *filename) { string table_create; string zero_ins; printf("Writing lookup table %s ", table_name); FILE *output_file = fopen(filename, "w"); if(!output_file) FatalError("\nCould not open %s for output\n", filename); table_create = string("create table ") + table_name + " (\n" " id integer not null primary key,\n" " item varchar(250)\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); zero_ins = string("insert into ") + table_name + " (id, item) values (0, '');\n\n"; fputs(zero_ins.c_str(), output_file); progress(); size_t record_count = 0; for(map::const_iterator i_row = lookup.begin(); i_row != lookup.end(); ++i_row) { stringstream ss; ss << i_row->second; string insert_this_item = string("insert into ") + table_name + " (id, item) values (" + ss.str() + ", '" + i_row->first + "');\n"; fputs(insert_this_item.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // GetLookupID // // Get the ID for a string in a lookup table. Errors out if not found. // size_t GetLookupID(map &lookup, string &key) { map::iterator i_item = lookup.find(key); if(i_item == lookup.end()) FatalError("Could not find lookup value '%s' in lookup\n", key.c_str()); return i_item->second; } // // OutputListings // // Write out the compacted listings table // void OutputListings() { string table_create; printf("Writing Listings table "); FILE *output_file = fopen("listings.sql", "w"); if(!output_file) FatalError("\nCould not open listings.sql for output\n"); table_create = "create table fda_listings (\n" " id integer not null primary key,\n" " firm_id integer,\n" " prodcode varchar(6),\n" " strength_id integer,\n" " listing_unit_id integer,\n" " rx_otc varchar(2),\n" " name_id integer\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); size_t record_count = 0; for(vector >::iterator i_lst = listings.begin(); i_lst != listings.end(); ++i_lst) { size_t strength = GetLookupID(listing_strengths, (*i_lst)["strength"]); size_t unit = GetLookupID(listing_units, (*i_lst)["unit"]); size_t name = GetLookupID(listing_names, (*i_lst)["name"]); stringstream strength_str; stringstream unit_str; stringstream name_str; strength_str << strength; unit_str << unit; name_str << name; string insert_str = string("insert into fda_listings (id, firm_id, prodcode, strength_id, listing_unit_id, rx_otc, name_id) values (") + (*i_lst)["id"] + ", " + LStripZeroes((*i_lst)["firm_id"]) + ", '" + (*i_lst)["prodcode"] + "', " + strength_str.str() + ", " + unit_str.str() + ", '" + (*i_lst)["rx_otc"] + "', " + name_str.str() + ");\n"; fputs(insert_str.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // OutputDoseForms // // Write out the compacted dose forms table. // void OutputDoseForms() { string table_create; printf("Writing Dose Forms table "); FILE *output_file = fopen("doseforms.sql", "w"); if(!output_file) FatalError("\nCould not open doseforms.sql for output\n"); table_create = "create table fda_doseforms (\n" " id integer not null primary key,\n" " listing_id integer,\n" " dosage_types_id integer\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); size_t record_count = 0; for(vector >::iterator i_lst = dosageforms.begin(); i_lst != dosageforms.end(); ++i_lst) { size_t dose_name = GetLookupID(dosage_types, (*i_lst)["dosage_name"]); stringstream id_str; stringstream dose_name_str; id_str << record_count; dose_name_str << dose_name; string insert_str = string("insert into fda_doseforms (id, listing_id, dosage_types_id) values (") + id_str.str() + ", " + (*i_lst)["listing_id"] + ", " + dose_name_str.str() + ");\n"; fputs(insert_str.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // OutputFirms // // Write out the compacted firms table. // void OutputFirms() { string table_create; printf("Writing Firms table "); FILE *output_file = fopen("firms.sql", "w"); if(!output_file) FatalError("\nCould not open firms.sql for output\n"); table_create = "create table fda_firms (\n" " id integer not null primary key,\n" " firm_name varchar(250)\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); size_t record_count = 0; for(vector >::iterator i_lst = firms.begin(); i_lst != firms.end(); ++i_lst) { string insert_str = string("insert into fda_firms (id, firm_name) values (") + LStripZeroes((*i_lst)["id"]) + ", '" + (*i_lst)["firm_name"] + "');\n"; fputs(insert_str.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // OutputFormulations // // Write out the formulations table // void OutputFormulations() { string table_create; printf("Writing Formulations table "); FILE *output_file = fopen("formulat.sql", "w"); if(!output_file) FatalError("\nCould not open formulat.sql for output\n"); table_create = "create table fda_formulations (\n" " id integer not null primary key,\n" " listing_id integer,\n" " strength varchar(12),\n" " unit_id integer,\n" " formula_name_id integer\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); size_t record_count = 0; for(vector >::iterator i_lst = formulations.begin(); i_lst != formulations.end(); ++i_lst) { size_t unit = GetLookupID(formula_units, (*i_lst)["unit"]); size_t name = GetLookupID(formula_names, (*i_lst)["ingredient_name"]); stringstream unit_str; stringstream name_str; stringstream id_str; id_str << record_count; unit_str << unit; name_str << name; string insert_str = string("insert into fda_formulations (id, listing_id, strength, unit_id, formula_name_id) values (") + id_str.str() + ", " + (*i_lst)["listing_id"] + ", '" + (*i_lst)["strength"] + "', " + unit_str.str() + ", " + name_str.str() + ");\n"; fputs(insert_str.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // OutputPackages // // Write out the packages table // void OutputPackages() { string table_create; printf("Writing Packages table "); FILE *output_file = fopen("packages.sql", "w"); if(!output_file) FatalError("\nCould not open packages.sql for output\n"); table_create = "create table fda_packages (\n" " id integer not null primary key,\n" " listing_id integer,\n" " pkgcode varchar(4),\n" " package_size_id integer,\n" " package_type_id integer\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); size_t record_count = 0; for(vector >::iterator i_lst = packages.begin(); i_lst != packages.end(); ++i_lst) { size_t psize = GetLookupID(package_sizes, (*i_lst)["packsize"]); size_t ptype = GetLookupID(package_types, (*i_lst)["packtype"]); stringstream size_str; stringstream type_str; stringstream id_str; id_str << record_count; size_str << psize; type_str << ptype; string insert_str = string("insert into fda_packages (id, listing_id, pkgcode, package_size_id, package_type_id) values (") + id_str.str() + ", " + (*i_lst)["listing_id"] + ", '" + (*i_lst)["pkgcode"] + "', " + size_str.str() + ", " + type_str.str() + ");\n"; fputs(insert_str.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // OutputRoutes // // Writes out the routes table // void OutputRoutes() { string table_create; printf("Writing Routes table "); FILE *output_file = fopen("routes.sql", "w"); if(!output_file) FatalError("\nCould not open routes.sql for output\n"); table_create = "create table fda_routes (\n" " id integer not null primary key,\n" " listing_id integer,\n" " route_id integer,\n" " route_name_id integer\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); size_t record_count = 0; for(vector >::iterator i_lst = routes.begin(); i_lst != routes.end(); ++i_lst) { size_t name = GetLookupID(route_names, (*i_lst)["route_name"]); stringstream name_str; stringstream id_str; id_str << record_count; name_str << name; string insert_str = string("insert into fda_routes (id, listing_id, route_id, route_name_id) values (") + id_str.str() + ", " + (*i_lst)["listing_id"] + ", " + LStripZeroes((*i_lst)["route_id"]) + ", " + name_str.str() + ");\n"; fputs(insert_str.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // OutputSchedules // // Writes out the DEA scheduling table // void OutputSchedules() { string table_create; printf("Writing Schedules table "); FILE *output_file = fopen("schedules.sql", "w"); if(!output_file) FatalError("\nCould not open schedules.sql for output\n"); table_create = "create table fda_schedules (\n" " id integer not null primary key,\n" " listing_id integer,\n" " schedule_num integer\n" ");\n" "commit;\n\n"; fputs(table_create.c_str(), output_file); progress(); size_t record_count = 0; for(vector >::iterator i_lst = schedules.begin(); i_lst != schedules.end(); ++i_lst) { stringstream id_str; id_str << record_count; string insert_str = string("insert into fda_schedules (id, listing_id, schedule_num) values (") + id_str.str() + ", " + (*i_lst)["listing_id"] + ", " + (*i_lst)["schedule_num"] + ");\n"; fputs(insert_str.c_str(), output_file); ++record_count; if(record_count % 200 == 0) // commit every 200 records fprintf(output_file, "commit; /* COMMIT after record #%u */\n", record_count); progress(); } if(record_count % 200 != 0) // If didn't just write a commit, do one now fprintf(output_file, "commit; /* FINAL COMMIT @ record #%u */\n", record_count); fclose(output_file); printf("\n"); } // // main program // int _tmain(int argc, _TCHAR* argv[]) { // Load all the input files LoadListings(); LoadDoseForms(); LoadFirms(); LoadFormulations(); LoadSchedules(); LoadPackages(); LoadRoutes(); // Build lookups BuildUniqueLookup(listings, "strength", listing_strengths); BuildUniqueLookup(listings, "unit", listing_units); BuildUniqueLookup(listings, "name", listing_names); BuildUniqueLookup(dosageforms, "dosage_name", dosage_types); BuildUniqueLookup(formulations, "unit", formula_units); BuildUniqueLookup(formulations, "ingredient_name", formula_names); BuildUniqueLookup(packages, "packsize", package_sizes); BuildUniqueLookup(packages, "packtype", package_types); BuildUniqueLookup(routes, "route_name", route_names); // Write lookup tables WriteLookup(listing_strengths, "listing_strengths", "listing_strengths.sql"); WriteLookup(listing_units, "listing_units", "listing_units.sql"); WriteLookup(listing_names, "listing_names", "listing_names.sql"); WriteLookup(dosage_types, "dosage_types", "dosage_types.sql"); WriteLookup(formula_units, "formula_units", "formula_units.sql"); WriteLookup(formula_names, "formula_names", "formula_names.sql"); WriteLookup(package_sizes, "package_sizes", "package_sizes.sql"); WriteLookup(package_types, "package_types", "package_types.sql"); WriteLookup(route_names, "route_names", "route_names.sql"); // Output tables OutputListings(); OutputDoseForms(); OutputFirms(); OutputFormulations(); OutputPackages(); OutputRoutes(); OutputSchedules(); return 0; }