I need a bit of guidance with writing a C program...a bit of quick background as to my level, I've programmed in Java previously, but this is my first time programming in C, and we've been tasked to translate a word count program from Java to C that consists of the following:
Read a file from memory
Count the words in the file
For each occurrence of a unique word, keep a word counter variable
Print out the top ten most frequent words and their corresponding occurrences
Here's the source program in Java:
package lab0;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
public class WordCount {
private ArrayList<WordCountNode> outputlist = null;
public WordCount(){
this.outputlist = new ArrayList<WordCountNode>();
}
/**
* Read the file into memory.
*
* @param filename name of the file.
* @return content of the file.
* @throws Exception if the file is too large or other file related exception.
*/
public char[] readFile(String filename) throws Exception{
char [] result = null;
File file = new File(filename);
long size = file.length();
if (size > Integer.MAX_VALUE){
throw new Exception("File is too large");
}
result = new char[(int)size];
FileReader reader = new FileReader(file);
int len, offset = 0, size2read = (int)size;
while(size2read > 0){
len = reader.read(result, offset, size2read);
if(len == -1)
break;
size2read -= len;
offset += len;
}
return result;
}
/**
* Make article word by word.
*
* @param article the content of file to be counted.
* @return string contains only letters and "'".
*/
private enum SPLIT_STATE {IN_WORD, NOT_IN_WORD};
/**
* Go through article, find all the words and add to output list
* with their count.
*
* @param article the content of the file to be counted.
* @return words in the file and their counts.
*/
public ArrayList<WordCountNode> countWords(char[] article){
SPLIT_STATE state = SPLIT_STATE.NOT_IN_WORD;
if(null == article)
return null;
char curr_ltr;
int curr_start = 0;
for(int i = 0; i < article.length; i++){
curr_ltr = Character.toUpperCase( article[i]);
if(state == SPLIT_STATE.IN_WORD){
article[i] = curr_ltr;
if ((curr_ltr < 'A' || curr_ltr > 'Z') && curr_ltr != '\'') {
article[i] = ' ';
//printf("\nthe word is %s\n\n",curr_start);
if(i - curr_start < 0){
System.out.println("i = " + i + " curr_start = " + curr_start);
}
addWord(new String(article, curr_start, i-curr_start));
state = SPLIT_STATE.NOT_IN_WORD;
}
}else{
if (curr_ltr >= 'A' && curr_ltr <= 'Z') {
curr_start = i;
article[i] = curr_ltr;
state = SPLIT_STATE.IN_WORD;
}
}
}
return outputlist;
}
/**
* Add the word to output list.
*/
public void addWord(String word){
int pos = dobsearch(word);
if(pos >= outputlist.size()){
outputlist.add(new WordCountNode(1L, word));
}else{
WordCountNode tmp = outputlist.get(pos);
if(tmp.getWord().compareTo(word) == 0){
tmp.setCount(tmp.getCount() + 1);
}else{
outputlist.add(pos, new WordCountNode(1L, word));
}
}
}
/**
* Search the output list and return the position to put word.
* @param word is the word to be put into output list.
* @return position in the output list to insert the word.
*/
public int dobsearch(String word){
int cmp, high = outputlist.size(), low = -1, next;
// Binary search the array to find the key
while (high - low > 1) {
next = (high + low) / 2;
// all in upper case
cmp = word.compareTo((outputlist.get(next)).getWord());
if (cmp == 0)
return next;
else if (cmp < 0)
high = next;
else
low = next;
}
return high;
}
public static void main(String args[]){
// handle input
if (args.length == 0){
System.out.println("USAGE: WordCount <filename> [Top # of results to display]\n");
System.exit(1);
}
String filename = args[0];
int dispnum;
try{
dispnum = Integer.parseInt(args[1]);
}catch(Exception e){
dispnum = 10;
}
long start_time = Calendar.getInstance().getTimeInMillis();
WordCount wordcount = new WordCount();
System.out.println("Wordcount: Running...");
// read file
char[] input = null;
try {
input = wordcount.readFile(filename);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.exit(1);
}
// count all word
ArrayList<WordCountNode> result = wordcount.countWords(input);
long end_time = Calendar.getInstance().getTimeInMillis();
System.out.println("wordcount: completed " + (end_time - start_time)/1000000 +
"." + (end_time - start_time)%1000000 + "(s)");
System.out.println("wordsort: running ...");
start_time = Calendar.getInstance().getTimeInMillis();
Collections.sort(result);
end_time = Calendar.getInstance().getTimeInMillis();
System.out.println("wordsort: completed " + (end_time - start_time)/1000000 +
"." + (end_time - start_time)%1000000 + "(s)");
Collections.reverse(result);
System.out.println("\nresults (TOP "+ dispnum +" from "+ result.size() +"):\n" );
// print out result
String str ;
for (int i = 0; i < result.size() && i < dispnum; i++){
if(result.get(i).getWord().length() > 15)
str = result.get(i).getWord().substring(0, 14);
else
str = result.get(i).getWord();
System.out.println(str + " - " + result.get(i).getCount());
}
}
public class WordCountNode implements Comparable{
private String word;
private long count;
public WordCountNode(long count, String word){
this.count = count;
this.word = word;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public long getCount() {
return count;
}
public void setCount(long count) {
this.count = count;
}
public int compareTo(Object arg0) {
// TODO Auto-generated method stub
WordCountNode obj = (WordCountNode)arg0;
if( count - obj.getCount() < 0)
return -1;
else if( count - obj.getCount() == 0)
return 0;
else
return 1;
}
}
}
Here's my attempt (so far) in C:
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
// Read in a file
FILE *readFile (char filename[]) {
FILE *inputFile;
inputFile = fopen (filename, "r");
if (inputFile == NULL) {
printf ("File could not be opened.\n");
exit (EXIT_FAILURE);
}
return inputFile;
}
// Return number of words in an array
int wordCount (FILE *filePointer, char filename[]) {//, char *words[]) {
// count words
int count = 0;
char temp;
while ((temp = getc(filePointer)) != EOF) {
//printf ("%c", temp);
if ((temp == ' ' || temp == '\n') && (temp != '\''))
count++;
}
count += 1; // counting method uses space AFTER last character in word - the last space
// of the last character isn't counted - off by one error
// close file
fclose (filePointer);
return count;
}
// Print out the frequencies of the 10 most frequent words in the console
int main (int argc, char *argv[]) {
/* Step 1: Read in file and check for errors */
FILE *filePointer;
filePointer = readFile (argv[1]);
/* Step 2: Do a word count to prep for array size */
int count = wordCount (filePointer, argv[1]);
printf ("Number of words is: %i\n", count);
/* Step 3: Create a 2D array to store words in the file */
// open file to reset marker to beginning of file
filePointer = fopen (argv[1], "r");
// store words in character array (each element in array = consecutive word)
char allWords[count][100]; // 100 is an arbitrary size - max length of word
int i,j;
char temp;
for (i = 0; i < count; i++) {
for (j = 0; j < 100; j++) { // labels are used with goto statements, not loops in C
temp = getc(filePointer);
if ((temp == ' ' || temp == '\n' || temp == EOF) && (temp != '\'') ) {
allWords[i][j] = '\0';
break;
} else {
allWords[i][j] = temp;
}
printf ("%c", allWords[i][j]);
}
printf ("\n");
}
// close file
fclose (filePointer);
/* Step 4: Use a simple selection sort algorithm to sort 2D char array */
// PStep 1: Compare two char arrays, and if
// (a) c1 > c2, return 2
// (b) c1 == c2, return 1
// (c) c1 < c2, return 0
qsort(allWords, count, sizeof(char[][]), pstrcmp);
/*
int k = 0, l = 0, m = 0;
char currentMax, comparedElement;
int max; // the largest element in the current 2D array
int elementToSort = 0; // elementToSort determines the element to swap with starting from the left
// Outer a iterates through number of swaps needed
for (k = 0; k < count - 1; k++) { // times of swaps
max = k; // max element set to k
// Inner b iterates through successive elements to fish out the largest element
for (m = k + 1; m < count - k; m++) {
currentMax = allWords[k][l];
comparedElement = allWords[m][l];
// Inner c iterates through successive chars to set the max vars to the largest
for (l = 0; (currentMax != '\0' || comparedElement != '\0'); l++) {
if (currentMax > comparedElement)
break;
else if (currentMax < comparedElement) {
max = m;
currentMax = allWords[m][l];
break;
}
else if (currentMax == comparedElement)
continue;
}
}
// After max (count and string) is determined, perform swap with temp variable
char swapTemp[1][20];
int y = 0;
do {
swapTemp[0][y] = allWords[elementToSort][y];
allWords[elementToSort][y] = allWords[max][y];
allWords[max][y] = swapTemp[0][y];
} while (swapTemp[0][y++] != '\0');
elementToSort++;
}
*/
int a, b;
for (a = 0; a < count; a++) {
for (b = 0; (temp = allWords[a][b]) != '\0'; b++) {
printf ("%c", temp);
}
printf ("\n");
}
// Copy rows to different array and print results
/*
char arrayCopy [count][20];
int ac, ad;
char tempa;
for (ac = 0; ac < count; ac++) {
for (ad = 0; (tempa = allWords[ac][ad]) != '\0'; ad++) {
arrayCopy[ac][ad] = tempa;
printf("%c", arrayCopy[ac][ad]);
}
printf("\n");
}
*/
/* Step 5: Create two additional arrays:
(a) One in which each element contains unique words from char array
(b) One which holds the count for the corresponding word in the other array */
/* Step 6: Sort the count array in decreasing order, and print the corresponding
array element as well as word count in the console */
return 0;
}
// Perform housekeeping tasks like freeing up memory and closing file
I'm really stuck on the selection sort algorithm. I'm currently using 2D arrays to represent strings, and that worked out fine, but when it came to sorting, using three level nested loops didn't seem to work, I tried to use qsort instead, but I don't fully understand that function as well.
Constructive feedback and criticism greatly welcome (...and needed)!