/////////////////////////////////////////////////////////////////////////////////
// MKCOR: a program that elaborates word corpora from ASCII text files
//
// Copyright (C) 1999/2000 by Marcel Schmuki
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
/////////////////////////////////////////////////////////////////////////////////

// mkcor.cc   given a text the program establishes a list in which
//            every word appears only one time.
// The program indicates for each word the number of occurrences as well as
// the corresponding lines.
//
// NOTE(review): the text this revision was produced from had lost every
// "<...>" span (the #include targets, the option-parsing loop of
// command_line_args() and the first lines of print_message()) as well as the
// accented characters of convert_accent(). Those parts are reconstructed from
// the surviving globals and help text and are marked NOTE(review) below;
// confirm them against the original sources.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define WORD_LEN 15          // size of a word buffer, terminator included
#define MAXLINEBUFFLEN 300   // size of the buffer used to read the input

// Characters that do not belong to words.
// Important: \n may not be in that list (doesn't work with strchr()).
// The array is larger than its default content so that a user supplied list
// can replace it without overflowing.
char stoplist[128] = " .,;?!\"':-*/()[]{}|#$%&=^`~_<>\\@+\t";

int debug = 0;
int alphabetical = 0;
int upper = 0;              // convert to upper case
int lower = 0;              // convert to lower case (if both are 0 => default treatment)
int no_accent = 0;
int special_convert = 0;
char last_option = ' ';     // value-taking option still waiting for its argument

char sep1[10] = "\t\t";     // written between the word and its count
char sep2[10] = "\t";       // written between the count and the line numbers
char sep3[10] = " ";        // written after every line number
char out_file[50] = "CORPUS.OUT";
char ctable1[50] = "";      // special conversion: characters to replace ...
char ctable2[50] = "";      // ... and their replacements (same positions)
char filename[50] = "";

FILE* file_in = NULL;
FILE* file_out = NULL;

// Copies src into dst, truncating if necessary; dst (of `size` bytes) is
// always NUL-terminated afterwards.
static void copy_bounded( char* dst, size_t size, const char* src )
{
    if (size == 0)
        return;
    strncpy( dst, src, size - 1 );
    dst[size - 1] = '\0';
}

///////////////////////////////////////////////////////////////////////////////
// List of the lines on which one word occurs
///////////////////////////////////////////////////////////////////////////////

typedef struct occelement * occurrence;

// One node of an occurrence list: a line number and the link to the next node.
struct occelement {
    int line;
    occurrence next;
    occelement( int line ) { this->line = line; next = NULL; }
};

typedef struct olist * occurrence_list;

// Singly linked list of line numbers. The last node (`sentinel`) is a
// placeholder that carries no data: the stored values are the nodes from
// `head` up to, but not including, `sentinel`.
struct olist {
    occurrence head;
    occurrence sentinel;
    olist( int line );
    ~olist( void );
    void back_insert( int line );
private:
    // The list owns its nodes, so copying it would free them twice.
    olist( const olist& );
    olist& operator=( const olist& );
};

// Creates an empty list; `line` only initialises the placeholder node.
olist::olist( int line )
{
    head = sentinel = new occelement( line );
}

// Releases every node, the placeholder included.
olist::~olist( void )
{
    while (head != sentinel) {
        occurrence following = head->next;
        delete head;
        head = following;
    }
    delete head;
}

// Appends `line`: the current placeholder receives the value and a fresh
// placeholder is linked behind it.
void olist::back_insert( int line )
{
    sentinel->line = line;
    sentinel->next = new occelement( line );
    sentinel = sentinel->next;
}

///////////////////////////////////////////////////////////////////////////////
// List of the words
///////////////////////////////////////////////////////////////////////////////

typedef struct lelement * element;

// One word of the corpus: its text, how often it was seen and where.
struct lelement {
    char * word;            // buffer of WORD_LEN bytes
    int number;             // number of occurrences
    occurrence_list where;  // lines of the occurrences (owned)
    element next;
    lelement( int line );
    ~lelement( void );
private:
    // Owns `word` and `where`, so copying would free them twice.
    lelement( const lelement& );
    lelement& operator=( const lelement& );
};

// Creates an element with an empty word, a count of 1 and an empty
// occurrence list; `line` is only handed on to the olist constructor.
lelement::lelement( int line )
{
    word = new char[WORD_LEN];
    word[0] = '\0';
    number = 1;
    where = new olist( line );
    next = NULL;
}

// Releases the word buffer and the occurrence list.
lelement::~lelement( void )
{
    delete[] word;
    delete where;   // was never released before
}

// Builds a complete element for word `w`, first seen on `line`.
// Words that do not fit into WORD_LEN bytes are truncated.
static element make_element( const char* w, int line )
{
    element created = new lelement( line );
    created->where->back_insert( line );
    copy_bounded( created->word, WORD_LEN, w );
    return created;
}

typedef struct wlist * list;

// Singly linked list of words, terminated by a placeholder element
// (`sentinel`) in the same way as olist.
struct wlist {
    element head;
    element sentinel;
    wlist( void );
    ~wlist( void );
    element find_element( char* w );
    element find_alphabetical_insert( char* w );
    void back_insert( char* w, int line );
    void direct_insert( element e, char* w, int line );
    void alphabetical_insert( char* w, int line);
    void insert( char* w, int line );
private:
    // The list owns its elements, so copying it would free them twice.
    wlist( const wlist& );
    wlist& operator=( const wlist& );
};

list word_list = NULL;

// Creates an empty word list.
wlist::wlist( void )
{
    head = sentinel = new lelement( 0 );
}

// Releases every element (and, through ~lelement, its occurrence list).
wlist::~wlist( void )
{
    while (head != sentinel) {
        element doomed = head;
        head = head->next;
        delete doomed;
    }
    delete head;
    head = sentinel = NULL;
}

// Returns the element whose word equals `w`, or NULL if there is none.
element wlist::find_element( char* w )
{
    for (element current = head; current != sentinel; current = current->next) {
        if (strcmp( current->word, w ) == 0)
            return current;
    }
    return NULL;
}

// Returns the last element whose word sorts before `w` (strcmp order), i.e.
// the element behind which `w` has to be linked; NULL means `w` belongs in
// front of the current head.
element wlist::find_alphabetical_insert( char* w )
{
    element previous = NULL;
    for (element current = head;
         (current != sentinel) && (strcmp( w, current->word ) > 0);
         current = current->next)
        previous = current;
    return previous;
}

// Appends `w` (seen on `line`) at the end of the list: the placeholder
// becomes the new element and a fresh placeholder is linked behind it.
void wlist::back_insert( char* w, int line )
{
    copy_bounded( sentinel->word, WORD_LEN, w );
    sentinel->where->back_insert( line );
    sentinel->next = new lelement( 0 );
    sentinel = sentinel->next;
}

// Links a new element for `w` directly behind `e`:  x-[e]-y  =>  x-e-[w]-y
void wlist::direct_insert( element e, char* w, int line )
{
    element created = make_element( w, line );
    created->next = e->next;
    e->next = created;
}

// Inserts `w` so that the list stays sorted in strcmp order.
void wlist::alphabetical_insert( char* w, int line )
{
    if (head == sentinel) {
        back_insert( w, line );
        return;
    }
    element before = find_alphabetical_insert( w );
    if (before == NULL) {
        // `w` sorts before everything stored so far: it becomes the head.
        element created = make_element( w, line );
        created->next = head;
        head = created;
    } else {
        direct_insert( before, w, line );
    }
}

// Records one occurrence of `w` on `line`: a known word gets its counter
// incremented and the line appended, an unknown word is added to the list
// (sorted if `alphabetical` is set, otherwise in order of first appearance).
void wlist::insert( char* w, int line )
{
    element known = find_element( w );
    if (known != NULL) {
        known->number++;
        known->where->back_insert( line );
    } else if (alphabetical) {
        alphabetical_insert( w, line );
    } else {
        back_insert( w, line );
    }
}

///////////////////////////////////////////////////////////////////////////////
// Command line
///////////////////////////////////////////////////////////////////////////////

// Stores `value` as the argument of the value-taking option `option`.
static void apply_option_value( char option, const char* value )
{
    switch (option) {
        case 'i': copy_bounded( stoplist, sizeof(stoplist), value ); break;
        case '1': copy_bounded( sep1, sizeof(sep1), value ); break;
        case '2': copy_bounded( sep2, sizeof(sep2), value ); break;
        case '3': copy_bounded( sep3, sizeof(sep3), value ); break;
        case 'o': copy_bounded( out_file, sizeof(out_file), value ); break;
        case 'x':
            copy_bounded( ctable1, sizeof(ctable1), value );
            special_convert = 1;
            break;
        case 'y':
            copy_bounded( ctable2, sizeof(ctable2), value );
            special_convert = 1;
            break;
        default:
            printf("Ignoring argument \"%s\"\n", value);
            break;
    }
}

// Evaluates the command line. The last argument is the input file; every
// argument before it is either a group of option letters ("-alo") or the
// value of the most recent value-taking option.
//
// NOTE(review): everything after the copy of the file name is a
// reconstruction. The letters a, l and o and the value-taking options
// i, 1, 2, 3, o, x, y come from the surviving help text; d, u and n are
// guesses for the debug, upper-case and no-accent switches - confirm.
void command_line_args( int anz, char* arg[] )
{
    if (anz < 2) {
        printf("Supply at least a filename...!\n");
        exit(1);    // without a file name there is nothing to open
    }

    copy_bounded( filename, sizeof(filename), arg[anz - 1] );

    for (int x = 1; x < anz - 1; x++) {
        if (arg[x][0] != '-') {
            apply_option_value( last_option, arg[x] );
            last_option = ' ';
            continue;
        }
        for (size_t y = 1; arg[x][y] != '\0'; y++) {
            switch (arg[x][y]) {
                case 'a': alphabetical = 1; break;
                case 'd': debug = 1; break;
                case 'u': upper = 1; break;
                case 'l': lower = 1; break;
                case 'n': no_accent = 1; break;
                default:  last_option = arg[x][y]; break;  // expects a value
            }
        }
    }
}

// Prints the program banner and the list of options.
//
// NOTE(review): the banner and the lines for -a, -d, -u, -l and -n are
// reconstructed; the lines from -i onwards are the surviving original text.
void print_message( void )
{
    printf("MKCOR: a program that elaborates word corpora from ASCII text files\n");
    printf("Copyright (C) 1999/2000 by Marcel Schmuki\n\n");
    printf("-a Alphabetical order\n");
    printf("-d Debug output\n");
    printf("-u Convert to upper case\n");
    printf("-l Convert to lower case\n");
    printf("-n Replace accented characters (accented e -> e)\n");
    printf("-i \"list\" Override default list of elements to ignore\n");
    printf("-1 -2 -3 \"sep\" Override default separators (e.g. for spreadsheet programs)\n");
    printf("-o \"file\" Output file (default is CORPUS.OUT)\n\n");
    printf("-x \"table1\" Special conversion: characters in table1 are replaced\n");
    printf("-y \"table2\" by those in table2 (allways use -x and -y together!)\n");
    printf("\nOptions can be combined, e.g. MKCOR -alo test.out test.txt\n");
}

///////////////////////////////////////////////////////////////////////////////
// Input
///////////////////////////////////////////////////////////////////////////////

// Opens the input file named by `filename`; terminates the program with
// exit status 1 if that fails.
void open_file( void )
{
    file_in = fopen( filename, "r" );
    if (file_in == NULL) {
        printf("Error opening file! Execution aborted\n");
        exit(1);
    }
}

// Applies the -l / -u case conversion to `c`; -l wins if both are set.
char convert_upper_lower( char c )
{
    // tolower()/toupper() require a value representable as unsigned char.
    const unsigned char code = static_cast<unsigned char>( c );
    if (lower)
        return static_cast<char>( tolower( code ) );
    if (upper)
        return static_cast<char>( toupper( code ) );
    return c;
}

// Replaces an accented character by its plain counterpart if -n is active.
//
// NOTE(review): table1 (the accented characters, two per vowel, in the order
// of table2) is empty in the text under review, presumably because the
// non-ASCII characters were lost. Until it is restored from the original
// sources this function returns `c` unchanged.
char convert_accent( char c )
{
    static const char table1[] = "";
    static const char table2[] = "aaeeiioouuAAEEIIOOUU";

    // strchr() also matches the terminating '\0', which is not a table entry.
    if (!no_accent || c == '\0')
        return c;

    const char* pos = strchr( table1, c );
    if (pos == NULL)
        return c;

    const size_t index = static_cast<size_t>( pos - table1 );
    return (index < sizeof(table2) - 1) ? table2[index] : c;
}

// Replaces `c` by the character at the same position in ctable2 if it occurs
// in ctable1 and special conversion is active. Characters of ctable1 that
// have no counterpart in a shorter ctable2 are left unchanged.
char special_conversion( char c )
{
    if (!special_convert || c == '\0')
        return c;

    const char* pos = strchr( ctable1, c );
    if (pos == NULL)
        return c;

    const size_t index = static_cast<size_t>( pos - ctable1 );
    return (index < strlen( ctable2 )) ? ctable2[index] : c;
}

// Runs `c` through case, accent and special conversion, in that order.
char convert( char c )
{
    return special_conversion( convert_accent( convert_upper_lower( c ) ) );
}

// Terminates the word collected in `item` (of `length` characters) and
// records it for line `line_number`.
static void store_word( char* item, int length, int line_number )
{
    item[length] = '\0';
    word_list->insert( item, line_number );
    if (debug)
        printf("%s\n", item);
}

// Reads the input file and records every word together with the number of
// the line it stands on (counted from 1).
//
// A word is a maximal run of characters that are neither in `stoplist` nor a
// line terminator. Words longer than WORD_LEN-1 characters are truncated.
// Lines longer than the read buffer arrive in several pieces; they still
// count as one line and a word spanning two pieces is kept in one piece.
void read_and_insert( void )
{
    char line[MAXLINEBUFFLEN];
    char item[WORD_LEN];
    int length = 0;           // characters collected in item so far
    int reading = 0;          // inside a word?
    int act_line = 0;
    int at_line_start = 1;    // does the next piece begin a new line?

    while (fgets( line, MAXLINEBUFFLEN, file_in ) != NULL) {
        if (at_line_start)
            act_line++;

        const size_t piece = strlen( line );
        for (size_t x = 0; x < piece; x++) {
            const char c = line[x];
            const int separator = (c == '\n') || (c == '\r')
                                  || (strchr( stoplist, c ) != NULL);
            if (!separator) {
                reading = 1;
                if (length < WORD_LEN - 1)   // drop what does not fit
                    item[length++] = convert( c );
            } else if (reading) {
                store_word( item, length, act_line );
                reading = length = 0;
            }
        }
        at_line_start = (piece == 0) || (line[piece - 1] == '\n');
    }

    // The last line may end without a line terminator.
    if (reading)
        store_word( item, length, act_line );
}

// Closes the input file.
void close_in_file( void )
{
    fclose( file_in );
    file_in = NULL;
}

///////////////////////////////////////////////////////////////////////////////
// Output
///////////////////////////////////////////////////////////////////////////////

// Writes the corpus to `out_file`, one word per line:
//   word sep1 count sep2 line sep3 line sep3 ...
// Terminates the program with exit status 1 if the file cannot be opened.
void open_and_write_out_file( void )
{
    file_out = fopen( out_file, "w" );
    if (file_out == NULL) {
        printf("Error opening out-file! Execution aborted\n");
        exit(1);   // there is no stream to write to or to close
    }

    for (element entry = word_list->head; entry != word_list->sentinel; entry = entry->next) {
        fprintf( file_out, "%s%s%i%s", entry->word, sep1, entry->number, sep2 );
        for (occurrence seen = entry->where->head; seen != entry->where->sentinel; seen = seen->next)
            fprintf( file_out, "%i%s", seen->line, sep3 );
        fprintf( file_out, "\n" );
    }

    fclose( file_out );
    file_out = NULL;
}

// Releases the word list; the destructors free the elements and their
// occurrence lists.
void dispose_all_memory( void )
{
    delete word_list;
    word_list = NULL;
}

int main( int anz, char* arg[] )
{
    word_list = new wlist();
    print_message();
    command_line_args( anz, arg );
    open_file();
    read_and_insert();
    close_in_file();
    open_and_write_out_file();
    dispose_all_memory();
    return 0;
}