How to parse a TAR file?

Q. Given a tar file, how do you list its contents as file names and their sizes?

Ans:

Details of the tar file format can be found here: http://en.wikipedia.org/wiki/Tar_%28computing%29

The original tar file format had two limitations:
1. It could not archive a file larger than 8 GB.
2. It could not archive a file whose name is longer than 100 bytes.

Below is the tar header format:

// Layout of one tar header block; the field widths below add up to 512 bytes.
{
    char name[100];      // file name (see rule b below)
    char mode[8];
    char uid[8];
    char gid[8];
    unsigned char size[12];  // file size (see rule a below)
    char mtime[12];
    char chksum[8];
    char typeflag;       // entry type; 'L' marks a LongLink entry (see rule c below)
    char linkname[100];
    char magic[6];
    char version[2];
    char uname[32];
    char gname[32];
    char devmajor[8];
    char devminor[8];
    char prefix[155];    // optional path prefix for name (see rule b below)
    char pad[12];
}

These problems later got fixed in GNU tar format.

a. To parse the file size, use the following rules:
Numeric values are encoded in octal numbers using ASCII digits, with leading zeroes. For historical reasons, a final NUL or space character should be used. Thus although there are 12 bytes reserved for storing the file size, only 11 octal digits can be stored. This gives a maximum file size of 8 gigabytes on archived files. To overcome this limitation, star in 2001 introduced a base-256 coding that is indicated by setting the high-order bit of the leftmost byte of a numeric field. GNU-tar and BSD-tar followed this idea.

b. To parse the file name, use the following rules:
If the first character of prefix is \0 (null character), the file name is name; otherwise, it is prefix/name. Files whose pathnames don’t fit in that length can not be stored in a tar archive.

c. LongLink rule:
I found that although rule ‘b’ is documented, it is not followed in practice. Long file names are instead stored in the tar file using the LongLink mechanism: a special tar entry that exists only to carry a long file name. A LongLink header has typeflag ‘L’, and the file name in this header is “././@LongLink“. The data content of this entry is nothing but the “long” file name of the next archive entry.

To generate a “gnu” format tar use below command in shell:

$ tar --format=gnu -cvf xyz.tar dir

The following code parses a “gnu” tar file and prints the file name and file size of every archived file.

// In-memory image of one tar header block. main() reads 512 bytes straight
// from the archive into this struct, so the member order and sizes must not
// change: the byte-sized members below total exactly 512 bytes. The offset
// of each member inside the block is noted next to it.
struct GnuTarHeader
{
    char name[100];          // offset   0: entry name, read by parseFileName()
    char mode[8];            // offset 100
    char uid[8];             // offset 108
    char gid[8];             // offset 116
    unsigned char size[12];  // offset 124: entry size, decoded by parseFileSize()
    char mtime[12];          // offset 136
    char chksum[8];          // offset 148
    char typeflag;           // offset 156: 'L' selects the LongLink path in main()
    char linkname[100];      // offset 157
    char magic[6];           // offset 257: compared with TAR_MAGIC by validateTarHeader()
    char version[2];         // offset 263
    char uname[32];          // offset 265
    char gname[32];          // offset 297
    char devmajor[8];        // offset 329
    char devminor[8];        // offset 337
    char prefix[155];        // offset 345: optional path prefix, read by parseFileName()
    char pad[12];            // offset 500: fills the block up to 512 bytes
};
 
// Asserts that the header's magic field matches TAR_MAGIC.
void validateTarHeader(GnuTarHeader *tarHeader);
// Decodes the header's size field into the global currentFileSize.
void parseFileSize(GnuTarHeader *tarHeader);
// Builds the global currentFileName from the header's prefix and name fields.
void parseFileName(GnuTarHeader *tarHeader);
// Reads the data blocks of a LongLink entry from fd into currentFileName and
// sets lastLongLinkHeader so the next header keeps that name.
void parseLongLink(GnuTarHeader *tarHeader, int fd);
// Parses a regular header and prints "<name> <size>" for the entry.
void parseTarHeader(GnuTarHeader *tarHeader);
 
// Magic bytes expected in GnuTarHeader::magic; note the trailing space.
char TAR_MAGIC[] = "ustar ";

// State shared between the parse* helpers and main().
// Name of the entry currently being processed.
std::string currentFileName = std::string();
// Size in bytes decoded from the most recently parsed header.
unsigned long long currentFileSize = 0ULL;
// True when the previous entry was a LongLink that already supplied the name.
bool lastLongLinkHeader = false;
 
int main(int argc, char **argv)
{
    int fd;
    int ret;
    unsigned long long seek;
    GnuTarHeader gnuHeader, emptyHeader;
    int emptyHeaders = 0;
 
    if (argc != 2) {
        printf ("Usage: %s tar_file_name\n", argv[0]);
        return 1;
    }
 
    fd = open(argv[1], O_RDONLY);
    assert (fd != -1);
 
    memset (&emptyHeader, 0, 512);
 
    while (1) {
        ret = read(fd, &gnuHeader, 512);
        assert(ret == 512);
        if (0 == memcmp(&gnuHeader, &emptyHeader, 512)) {
            emptyHeaders ++;
            if (2 == emptyHeaders) {
                break;
            }
            continue;
        }
        emptyHeaders = 0;
        validateTarHeader(&gnuHeader);
        if ('L' == gnuHeader.typeflag) {
            parseLongLink(&gnuHeader, fd);
        } else {
            parseTarHeader(&gnuHeader);
            seek = (currentFileSize/512) + (currentFileSize%512 ? 1 : 0);
            seek *= 512;
            seek = lseek(fd, seek, SEEK_CUR);
            assert(seek != -1);
        }
    }
 
    return 0;
}
 
// Checks the six magic bytes of the header against TAR_MAGIC, asserting on
// the first mismatch.
void validateTarHeader(GnuTarHeader *tarHeader)
{
    int i = 0;
    while (i != 6) {
        assert(tarHeader->magic[i] == TAR_MAGIC[i]);
        ++i;
    }
}
 
// Decodes tarHeader->size into the global currentFileSize.
//
// Two encodings are handled:
//  - base-256: the high bit of the first byte is set and the remaining bytes
//    hold a big-endian binary number (used when the size does not fit in
//    11 octal digits);
//  - octal: ASCII digits '0'..'7', optionally preceded by space/NUL padding
//    and ended by a space, a NUL or the end of the field.
void parseFileSize(GnuTarHeader *tarHeader)
{
    const unsigned char *field = tarHeader->size;
    const int fieldLen = static_cast<int>(sizeof(tarHeader->size));
    unsigned long long size = 0;

    if (field[0] & 0x80) {
        // Base-256 encoding. High-order bytes beyond 64 bits shift out of the
        // accumulator; they are zero for any size this type can represent.
        for (int i = 1; i < fieldLen; i++) {
            size = size * 256 + field[i];
        }
    } else {
        // Octal encoding: skip leading padding first.
        int i = 0;
        while (i < fieldLen && (field[i] == ' ' || field[i] == 0)) {
            i++;
        }
        // Stop at the first byte that is not an octal digit so a terminator
        // or a corrupt byte can never be folded into the value.
        for (; i < fieldLen && field[i] >= '0' && field[i] <= '7'; i++) {
            size = size * 8 + static_cast<unsigned long long>(field[i] - '0');
        }
    }

    currentFileSize = size;
}
 
// Builds the global currentFileName from the header: "name" when the prefix
// field is empty, otherwise "prefix/name".
//
// Neither field is guaranteed to be NUL-terminated when it is completely
// full, so each length is bounded by the size of its array.
//
// NOTE(review): GNU-format headers may store other data in the bytes this
// struct calls `prefix` - confirm against the GNU tar header layout before
// relying on the prefix for such archives.
void parseFileName(GnuTarHeader *tarHeader)
{
    std::string::size_type prefixLen = 0;
    while (prefixLen < sizeof(tarHeader->prefix) && tarHeader->prefix[prefixLen] != '\0') {
        ++prefixLen;
    }

    // Stop at the first NUL of the name itself (the check must use the
    // current index, not always the first byte).
    std::string::size_type nameLen = 0;
    while (nameLen < sizeof(tarHeader->name) && tarHeader->name[nameLen] != '\0') {
        ++nameLen;
    }

    currentFileName.assign(tarHeader->prefix, prefixLen);
    if (prefixLen != 0) {
        // A single separator joins prefix and name.
        currentFileName += '/';
    }
    currentFileName.append(tarHeader->name, nameLen);
}
 
// Handles a LongLink ('L') entry: its data blocks hold the file name of the
// next archive entry. The name is read from fd into the global
// currentFileName and lastLongLinkHeader is set so that the next header does
// not overwrite it.
//
// tarHeader - the LongLink header; its size field gives the name length.
// fd        - archive descriptor positioned at the entry's first data block.
void parseLongLink(GnuTarHeader *tarHeader, int fd)
{
    char block[512+1]; // last byte for '\0'

    currentFileName = "";
    parseFileSize(tarHeader);

    // A zero-length entry has no data blocks, so nothing must be read for it;
    // reading anyway would swallow the next header.
    unsigned long long remaining = currentFileSize;
    while (remaining > 0) {
        const int ret = read (fd, block, 512);
        assert(ret == 512);
        if (ret != 512) {
            // NDEBUG builds: never append a buffer that was not filled.
            break;
        }

        const unsigned long long used = (remaining < 512) ? remaining : 512;
        block[used] = '\0';
        // C-string append: stops at the NUL stored at the end of the name.
        currentFileName += block;
        remaining -= used;
    }

    // Only override the next header's own name when a name was actually read.
    lastLongLinkHeader = !currentFileName.empty();
}
 
// Processes a non-LongLink header: decodes its size, resolves its name and
// prints "<name> <size>".
void parseTarHeader(GnuTarHeader *tarHeader)
{
    parseFileSize(tarHeader);

    // A preceding LongLink entry has already stored the name in
    // currentFileName; only read it from the header otherwise.
    if (!lastLongLinkHeader) {
        parseFileName(tarHeader);
    }

    // The long name applies to this entry only.
    lastLongLinkHeader = false;

    printf ("%s %llu\n", currentFileName.c_str(), currentFileSize);
}