(非常急迫6/5前需要解答,請大家幫忙) File Reverse in C
問題是要讀取一個 2GB 的文字檔(中文、英文與數字混雜),並把反轉後的內容寫到 output.txt,也就是 output.txt 的內容必須是該文字檔以字元為單位的 reverse(例如:123 頂客論壇 -> 壇論客頂 321)。此外,必須先判斷這個文字檔的編碼是 UTF-8、UTF-8(含 BOM)、ASCII 還是 Big5,之後才進行 reverse 的動作。下面是我的程式碼:
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
/*
 * Byte-order-mark signatures, grouped by code-unit width.
 * The UTF-32 markers contain NUL bytes, so they must be compared with
 * memcmp() and an explicit length, never with the str*() functions.
 */

/* UTF-8: 3 bytes */
const char *UTF_8_BOM = "\xEF" "\xBB" "\xBF";

/* UTF-16: 2 bytes */
const char *UTF_16_LE_BOM = "\xFF" "\xFE";
const char *UTF_16_BE_BOM = "\xFE" "\xFF";

/* UTF-32: 4 bytes (the little-endian form starts with the UTF-16-LE marker) */
const char *UTF_32_LE_BOM = "\xFF" "\xFE" "\x00" "\x00";
const char *UTF_32_BE_BOM = "\x00" "\x00" "\xFE" "\xFF";
/*
 * Check whether the two bytes at `data` form a Big5 double-byte character.
 *
 * Accepted lead bytes:  0xA4..0xC6 or 0xC9..0xF9
 * Accepted trail bytes: 0x40..0x7E or 0xA1..0xFE
 *
 * data: pointer to at least one readable byte; a second byte is read only
 *       when the first one is an accepted lead byte, so a NUL-terminated
 *       string of any length (including "") is safe to pass.
 *
 * Returns true when both bytes are in range, false otherwise (also for NULL).
 */
bool IsBig5(const char *data)
{
    unsigned char lead;
    unsigned char trail;

    if (data == NULL)
        return false;

    lead = (unsigned char)data[0];
    if (!((lead >= 0xA4 && lead <= 0xC6) || (lead >= 0xC9 && lead <= 0xF9)))
        return false;

    /* Only touch the second byte once the first is known to be a lead
     * byte (and therefore not the string terminator). */
    trail = (unsigned char)data[1];
    return (trail >= 0x40 && trail <= 0x7E) || (trail >= 0xA1 && trail <= 0xFE);
}
/*
 * Tell whether a buffer holds only 7-bit ASCII.
 *
 * data: start of the buffer
 * size: number of bytes to examine
 *
 * Returns 1 when no byte has its high bit set (an empty buffer counts as
 * ASCII), 0 as soon as a byte >= 0x80 is found.
 */
int isASCII(const char *data, size_t size)
{
    size_t pos;

    for (pos = 0; pos < size; pos++) {
        if ((unsigned char)data[pos] > 0x7F)
            return 0;
    }
    return 1;
}
/*
 * Identify a byte-order mark at the start of a buffer.
 *
 * data: start of the buffer
 * size: number of readable bytes in the buffer
 *
 * Returns the encoding name as a string literal ("UTF-8", "UTF-32-LE",
 * "UTF-32-BE", "UTF-16-LE" or "UTF-16-BE"), or NULL when the buffer does
 * not begin with any of the known markers. The returned string must not
 * be modified or freed.
 */
char* check_bom(const char *data, size_t size)
{
    /* Probe order matters: the UTF-32-LE marker begins with the UTF-16-LE
     * one, so the 4-byte markers have to be tried before the 2-byte ones. */
    const struct {
        const char *marker;
        size_t length;
        char *label;
    } known[] = {
        { UTF_8_BOM,     3, "UTF-8"     },
        { UTF_32_LE_BOM, 4, "UTF-32-LE" },
        { UTF_32_BE_BOM, 4, "UTF-32-BE" },
        { UTF_16_LE_BOM, 2, "UTF-16-LE" },
        { UTF_16_BE_BOM, 2, "UTF-16-BE" },
    };
    size_t idx;

    for (idx = 0; idx < sizeof known / sizeof known[0]; idx++) {
        if (size < known[idx].length)
            continue;
        if (memcmp(data, known[idx].marker, known[idx].length) == 0)
            return known[idx].label;
    }
    return NULL;
}
/*
 * Check whether a buffer is well-formed UTF-8.
 *
 * data: start of the buffer
 * size: number of bytes to validate
 *
 * Returns 1 when every byte belongs to a valid sequence (an empty buffer
 * is valid), 0 on the first problem found: an invalid lead byte, a
 * sequence cut off by the end of the buffer, a bad continuation byte, an
 * overlong encoding, a surrogate (U+D800..U+DFFF) or a code point above
 * U+10FFFF.
 */
int isUTF8(const char *data, size_t size)
{
    const unsigned char *str = (const unsigned char *)data;
    const unsigned char *end = str + size;
    unsigned char byte;
    unsigned int code_length, i;
    uint32_t ch;

    while (str != end) {
        byte = *str;
        if (byte <= 0x7F) {
            /* 1 byte sequence: U+0000..U+007F */
            str += 1;
            continue;
        }

        if (0xC2 <= byte && byte <= 0xDF) {
            /* 0b110xxxxx: 2 bytes sequence (0xC0/0xC1 would be overlong) */
            code_length = 2;
        } else if (0xE0 <= byte && byte <= 0xEF) {
            /* 0b1110xxxx: 3 bytes sequence */
            code_length = 3;
        } else if (0xF0 <= byte && byte <= 0xF4) {
            /* 0b11110xxx: 4 bytes sequence */
            code_length = 4;
        } else {
            /* invalid first byte of a multibyte character */
            return 0;
        }

        /* Compare lengths rather than pointers so that no pointer beyond
         * one-past-the-end of the buffer is ever formed. */
        if ((size_t)(end - str) < code_length) {
            /* truncated string or invalid byte sequence */
            return 0;
        }

        /* Check continuation bytes: bit 7 should be set, bit 6 should be
         * unset (0b10xxxxxx). Each byte is tested individually via str[i]. */
        for (i = 1; i < code_length; i++) {
            if ((str[i] & 0xC0) != 0x80)
                return 0;
        }

        if (code_length == 2) {
            /* 2 bytes sequence: U+0080..U+07FF.
             * The lead byte range 0xC2..0xDF already guarantees
             * 0x0080 <= ch <= 0x07FF, so no further range check is needed. */
        } else if (code_length == 3) {
            /* 3 bytes sequence: U+0800..U+FFFF */
            ch = ((uint32_t)(str[0] & 0x0f) << 12) +
                 ((uint32_t)(str[1] & 0x3f) << 6) +
                 (uint32_t)(str[2] & 0x3f);
            /* reject overlong encodings of U+0000..U+07FF */
            if (ch < 0x0800)
                return 0;
            /* surrogates (U+D800-U+DFFF) are invalid in UTF-8:
             * (ch >> 11) == 0x1b is the same as 0xD800 <= ch <= 0xDFFF */
            if ((ch >> 11) == 0x1b)
                return 0;
        } else {
            /* 4 bytes sequence: U+10000..U+10FFFF */
            ch = ((uint32_t)(str[0] & 0x07) << 18) +
                 ((uint32_t)(str[1] & 0x3f) << 12) +
                 ((uint32_t)(str[2] & 0x3f) << 6) +
                 (uint32_t)(str[3] & 0x3f);
            if ((ch < 0x10000) || (0x10FFFF < ch))
                return 0;
        }
        str += code_length;
    }
    return 1;
}
/* Read exactly len bytes from fd into buf, retrying on short reads.
 * Returns 0 on success, -1 on error or premature end of file. */
static int read_fully(int fd, char *buf, size_t len)
{
    while (len > 0) {
        ssize_t got = read(fd, buf, len);
        if (got <= 0)
            return -1;
        buf += got;
        len -= (size_t)got;
    }
    return 0;
}

/* Write exactly len bytes from buf to fd, retrying on short writes.
 * Returns 0 on success, -1 when write() fails or makes no progress. */
static int write_fully(int fd, const char *buf, size_t len)
{
    while (len > 0) {
        ssize_t put = write(fd, buf, len);
        if (put <= 0)
            return -1;
        buf += put;
        len -= (size_t)put;
    }
    return 0;
}

/* Reverse the first len bytes of buf in place. */
static void reverse_bytes(char *buf, size_t len)
{
    size_t lo, hi;

    if (len < 2)
        return;
    for (lo = 0, hi = len - 1; lo < hi; lo++, hi--) {
        char tmp = buf[lo];
        buf[lo] = buf[hi];
        buf[hi] = tmp;
    }
}

/*
 * Usage: <program> <source> <dest>
 *
 * Writes the bytes of <source> to <dest> in reverse order, prints the
 * source size and "DONE" on stdout, and exits with -1 (after a message on
 * stderr) on any failure.
 *
 * The file is processed from the end in fixed-size chunks, each reversed
 * in memory, instead of issuing one lseek/read/write per byte.
 *
 * NOTE: the reversal is strictly byte-wise, so multi-byte characters
 * (UTF-8, Big5) come out with their bytes swapped. This routine can
 * already reverse a file, but the text encoding still has to be detected
 * first and the reversal made character-aware.
 */
int main(int argc, char *argv[]) {
    enum { CHUNK_SIZE = 64 * 1024 };
    static char chunk[CHUNK_SIZE];
    int source, dest;
    off_t filesize;   /* off_t, not int: the input may be 2 GiB or larger */
    off_t remaining;

    if (argc != 3) {
        fprintf(stderr, "usage %s <source> <dest>", argv[0]);
        exit(-1);
    }
    /* The second argument of open() is a flag set, not a permission mask. */
    if ((source = open(argv[1], O_RDONLY)) < 0) {
        fprintf(stderr, "can't open source");
        exit(-1);
    }
    if ((dest = creat(argv[2], 0700)) < 0) { /* rwx permission for user on dest */
        fprintf(stderr, "can't create dest");
        exit(-1);
    }

    /* Seeking to the end yields the file size. */
    filesize = lseek(source, (off_t) 0, SEEK_END);
    if (filesize < 0) {
        fprintf(stderr, "can't seek source");
        exit(-1);
    }
    printf("Source file size is %lld\n", (long long) filesize);

    /* Walk backwards: the last chunk of the source becomes the first
     * chunk of the destination. */
    remaining = filesize;
    while (remaining > 0) {
        size_t len = (remaining < (off_t) CHUNK_SIZE) ? (size_t) remaining
                                                      : (size_t) CHUNK_SIZE;
        remaining -= (off_t) len;

        if (lseek(source, remaining, SEEK_SET) < 0 ||
            read_fully(source, chunk, len) != 0) {
            fprintf(stderr, "can't read source");
            exit(-1);
        }
        reverse_bytes(chunk, len);
        if (write_fully(dest, chunk, len) != 0) {
            fprintf(stderr, "can't write dest");
            exit(-1);
        }
    }

    /* source and dest are file descriptors: close(), not fclose(). */
    close(source);
    if (close(dest) < 0) {
        fprintf(stderr, "can't write dest");
        exit(-1);
    }
    write(STDOUT_FILENO, "DONE\n", 5);
    return 0;
}