// Filter text
// Solution
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
// Reads whitespace-separated tokens from `in`, splits them on punctuation,
// lower-cases A-Z, and stores the resulting words in `words`.
// Returns the number of words stored.
int load(std::ifstream& in, std::string words[]);
// Writes the first `count` words to `out`, one per line.
void list1(std::string words[], const int count, std::ofstream& out);
// Writes the first `count` (word, count) pairs to `out`, one pair per line.
void list2(std::string words[], const int counts[], const int count, std::ofstream& out);
// Sorts the first `count` words into ascending order using bubble sort.
void sort(std::string words[], const int count);
// Collapses runs of equal adjacent words into one entry each and records the
// run lengths in `counts`. Returns the number of distinct entries kept.
int merge(std::string words[], const int count, int counts[]);
int main() {
// ask the user to enter the input file name,
// assuming it ends with .txt, then generate the output file name.
string input, output1, output2;
cout<< “Enter the name of the input file (.txt): “;
cin>> input;
intlen = input.length();
output1 = input.substr(0, len – 4) + “_lower.txt”;
output2 = input.substr(0, len – 4) + “_clc.txt”;
// open the files as input and output streams
ifstream in(input);
ofstream out1(output1);
ofstream out2(output2);
// load word (processed) from input stream to an array
string words[1000];
int count = load(in, words);
// save the count into the output streams
out1 << “text size : ” << count <<endl;
out2 << “text size : ” << count <<endl;
// save words into the first output file.
list1(words, count, out1);
// sort words
sort(words, count);
// merge same words into one, then save the count into the array
int counts[1000];
count = merge(words, count, counts);
// save the result into the second output file
out2 << “vocab size : ” << count <<endl;
list2(words, counts, count, out2);
// close file streams
in.close();
out1.close();
out2.close();
cout<< “Output is saved into file ” << output1 << ” and ” << output2 <<endl;
return 0;
}
// Returns true when `c` is one of the characters this program treats as a
// word separator: ' , . - _ < > ( ) ! ? \ " ; :
// Any other character (letters, digits, '@', '#', ...) is part of a word.
bool ispunct(char c) {
    switch (c) {
        case '\'':
        case ',':
        case '.':
        case '-':
        case '_':
        case '<':
        case '>':
        case '(':
        case ')':
        case '!':
        case '?':
        case '\\':
        case '"':
        case ';':
        case ':':
            return true;
        default:
            return false;
    }
}
// Reads whitespace-separated tokens from `in`, splits each token at the
// characters ispunct() reports as punctuation, converts 'A'-'Z' to lower case,
// and stores every non-empty piece in `words` in reading order.
//
// Parameters:
//   in    - input stream to read from, consumed until extraction fails or the
//           array is full.
//   words - destination array; it must have room for at least 1000 entries.
// Returns the number of words stored (never more than 1000).
int load(std::ifstream& in, std::string words[]) {
    // The signature carries no capacity, so reading stops at the size of the
    // array main() allocates rather than writing past its end.
    const int kMaxWords = 1000;

    int count = 0;
    std::string token;
    std::string word;  // grows as needed, so long words cannot overflow a buffer
    while (count < kMaxWords && in >> token) {
        word.clear();
        const std::string::size_type length = token.length();
        // Run one position past the end so the word that finishes the token is
        // flushed by the same branch that handles punctuation.
        for (std::string::size_type i = 0; i <= length && count < kMaxWords; ++i) {
            if (i < length && !ispunct(token[i])) {
                char c = token[i];
                if (c >= 'A' && c <= 'Z') {  // to lower case
                    c = static_cast<char>(c - 'A' + 'a');
                }
                word += c;
            } else if (!word.empty()) {
                // A punctuation mark or the end of the token closes the word.
                words[count++] = word;
                word.clear();
            }
        }
    }
    return count;
}
// Writes the first `count` entries of `words` to `out`, one word per line.
//
// Parameters:
//   words - array holding at least `count` words.
//   count - number of words to write; nothing is written when it is <= 0.
//   out   - destination stream.
void list1(std::string words[], const int count, std::ofstream& out) {
    for (int i = 0; i < count; ++i) {
        // '\n' rather than endl: same text, without flushing after every line.
        out << words[i] << '\n';
    }
}
// Writes the first `count` entries to `out` as "<word> <count>", one per line.
//
// Parameters:
//   words  - array holding at least `count` words.
//   counts - array holding the number paired with each word, same indexing.
//   count  - number of pairs to write; nothing is written when it is <= 0.
//   out    - destination stream.
void list2(std::string words[], const int counts[], const int count, std::ofstream& out) {
    for (int i = 0; i < count; ++i) {
        // '\n' rather than endl: same text, without flushing after every line.
        out << words[i] << " " << counts[i] << '\n';
    }
}
// Sorts the first `count` entries of `words` into ascending order (as defined
// by std::string's operator>) using bubble sort.
//
// Parameters:
//   words - array to sort in place.
//   count - number of entries to sort; 0 or 1 leaves the array untouched.
void sort(std::string words[], const int count) {
    // After each pass the largest remaining word sits at index `last`, so the
    // unsorted region shrinks by one from the right.
    for (int last = count - 1; last > 0; --last) {
        bool swapped = false;
        for (int j = 0; j < last; ++j) {
            if (words[j] > words[j + 1]) {
                // Exchange the neighbours so the smaller one comes first;
                // swap() trades the buffers instead of copying three strings.
                words[j].swap(words[j + 1]);
                swapped = true;
            }
        }
        if (!swapped) {
            break;  // a pass without swaps means the array is already sorted
        }
    }
}
// Collapses each run of equal adjacent words into a single entry and records
// how long the run was. Equal words are only merged when they are adjacent,
// so the caller sorts the array first.
//
// Parameters:
//   words  - array of `count` words; on return its first K entries are the
//            distinct words in their original relative order.
//   count  - number of words in `words`.
//   counts - output array; counts[i] receives the run length of words[i]
//            for i in [0, K).
// Returns K, the number of entries kept (0 when count <= 0).
int merge(std::string words[], const int count, int counts[]) {
    if (count <= 0) {
        return 0;
    }

    // The first word always starts the first run.
    int kept = 1;
    counts[0] = 1;
    for (int i = 1; i < count; ++i) {
        if (words[i] == words[kept - 1]) {
            // Still the same word: extend the current run.
            ++counts[kept - 1];
        } else {
            // A new word: store it right after the last kept entry, which
            // overwrites the duplicates left between `kept` and `i`.
            words[kept] = words[i];
            counts[kept] = 1;
            ++kept;
        }
    }
    return kept;
}