Source of file EnsembleFastaEntry.php

Size: 4,612 Bytes - Last Modified: 2019-06-14T10:51:52+01:00

src/Reader/FastaEntry/EnsembleFastaEntry.php

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
<?php
/**
 * Copyright 2019 University of Liverpool
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
namespace pgb_liv\php_ms\Reader\FastaEntry;

use pgb_liv\php_ms\Core\Entry\DatabaseEntry;
use pgb_liv\php_ms\Core\Gene;
use pgb_liv\php_ms\Core\Protein;
use pgb_liv\php_ms\Core\Chromosome;
use pgb_liv\php_ms\Core\Transcript;
use pgb_liv\php_ms\Core\Database\EnsembleGDatabase;
use pgb_liv\php_ms\Core\Database\EnsembleTDatabase;
use pgb_liv\php_ms\Core\Database\DatabaseFactory;

/**
 * FASTA entry parser to map Ensemble header to protein elements
 *
 * @author Andrew Collins
 */
class EnsembleFastaEntry implements FastaInterface
{

    const CHROMOSOME = 'chromosome';

    const GENE = 'gene';

    const GENE_BIOTYPE = 'gene_biotype';

    const GENE_SYMBOL = 'gene_symbol';

    const TRANSCRIPT = 'transcript';

    const TRANSCRIPT_BIOTYPE = 'transcript_biotype';

    const DESCRIPTION = 'description';

    public static function parseIdentifier($identifier)
    {
        $matches = null;
        $isMatched = preg_match('/^(ENSP|GENSCAN)([0-9]+).([0-9]+)$/', $identifier, $matches);

        if (! $isMatched) {
            throw new \InvalidArgumentException($identifier . ' is not Ensemble format');
        }

        return array(
            $matches[1],
            $matches[2],
            $matches[3]
        );
    }

    /**
     *
     * {@inheritdoc}
     */
    public function getProtein($identifier, $description)
    {
        $protein = new Protein();

        $identifierParts = self::parseIdentifier($identifier);

        $dbEntry = null;

        $database = DatabaseFactory::getDatabase($identifierParts[0]);
        $dbEntry = new DatabaseEntry($database);
        $protein->setDatabaseEntry($dbEntry);

        $dbEntry->setUniqueIdentifier($identifierParts[1]);
        $dbEntry->setEntryVersion($identifierParts[2]);

        $matches = array();
        preg_match_all('/(\w+):(.*?(?=\s\[|\s\w+:))/', $description, $matches);

        $keyValues = array();

        foreach ($matches[1] as $key => $value) {
            switch ($value) {
                case self::CHROMOSOME:
                case self::GENE:
                case self::GENE_BIOTYPE:
                case self::GENE_SYMBOL:
                case self::TRANSCRIPT:
                case self::TRANSCRIPT_BIOTYPE:
                case self::DESCRIPTION:
                    $keyValues[$value] = $matches[2][$key];
                    break;
                default:
                    // Unknown
                    break;
            }
        }

        if (isset($keyValues[self::CHROMOSOME])) {
            $chromosome = new Chromosome();
            $chromosome->setName($keyValues[self::CHROMOSOME]);

            $protein->setChromosome($chromosome);
        }

        if (isset($keyValues[self::GENE])) {
            if (isset($keyValues[self::GENE_SYMBOL])) {
                $gene = Gene::getInstance($keyValues[self::GENE_SYMBOL]);
            } else {
                $gene = new Gene();
            }

            $geneEntry = new DatabaseEntry(EnsembleGDatabase::getInstance());
            $geneEntry->setUniqueIdentifier($keyValues[self::GENE]);
            $gene->setDatabaseEntry($geneEntry);

            $protein->setGene($gene);

            if (isset($keyValues[self::GENE_BIOTYPE])) {
                $gene->setType($keyValues[self::GENE_BIOTYPE]);
            }
        }

        if (isset($keyValues[self::TRANSCRIPT])) {
            $transcript = new Transcript();
            $transcriptEntry = new DatabaseEntry(EnsembleTDatabase::getInstance());
            $transcriptEntry->setUniqueIdentifier($keyValues[self::TRANSCRIPT]);
            $transcript->setDatabaseEntry($transcriptEntry);
            $protein->setTranscript($transcript);

            if (isset($keyValues[self::TRANSCRIPT_BIOTYPE])) {
                $transcript->setType($keyValues[self::TRANSCRIPT_BIOTYPE]);
            }
        }

        if (isset($keyValues[self::DESCRIPTION])) {
            $protein->setDescription($keyValues[self::DESCRIPTION]);
        }

        return $protein;
    }
}