Skip to content
2 changes: 1 addition & 1 deletion .appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ install:
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda info -a
- conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy pandas nose
- conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy pandas nose requests
- activate test-environment

test_script:
Expand Down
167 changes: 114 additions & 53 deletions biopandas/mmcif/pandas_mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import numpy as np
import pandas as pd
import requests

from ..pdb.engines import amino3to1dict
from ..pdb.pandas_pdb import PandasPdb
Expand Down Expand Up @@ -63,66 +64,89 @@ def read_mmcif(self, path):
self

"""
self.mmcif_path, self.pdb_text = self._read_mmcif(path=path)
self._df = self._construct_df(text=self.pdb_text)
self.mmcif_path, self.mmcif_text = self._read_mmcif(path=path)
self._df = self._construct_df(text=self.mmcif_text)
# self.header, self.code = self._parse_header_code() #TODO: implement
self.code = self.data["entry"]["id"][0].lower()
return self

def fetch_mmcif(self, pdb_code: Optional[str] = None, uniprot_id: Optional[str] = None, source: str = "pdb"):
"""Fetches mmCIF file contents from the Protein Databank at rcsb.org or AlphaFold database at https://alphafold.ebi.ac.uk/.
.
def fetch_mmcif(
self,
pdb_code: Optional[str] = None,
uniprot_id: Optional[str] = None,
sequence: Optional[str] = None,
source: str = "pdb",
):
"""
Fetches mmCIF file contents from the Protein Databank at rcsb.org,
AlphaFold database at https://alphafold.ebi.ac.uk/ or ESMFold database.
.

Parameters
----------
pdb_code : str, optional
A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
Parameters
----------
pdb_code : str, optional
A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the
PDB. Defaults to `None`.

uniprot_id : str, optional
A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
uniprot_id : str, optional
A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from
the AF2 database. Defaults to `None`.

source : str
The source to retrieve the structure from
(`"pdb"`, `"alphafold2-v1"`, `"alphafold2-v2"` or `"alphafold2-v3"`). Defaults to `"pdb"`.
sequence : str, optional
A protein sequence to retrieve a structure from the ESMFold
database. Defaults to `None`.

Returns
---------
self
source : str
The source to retrieve the structure from. Can be one of
(`"pdb"`, `"alphafold2-v1"`, `"alphafold2-v2"` or
`"alphafold2-v3"`). Defaults to `"pdb"`.

Returns
---------
self

"""
if sequence is not None:
self.esmfold(sequence)
return self
# Sanitize input
invalid_input_identifier_1 = pdb_code is None and uniprot_id is None
invalid_input_identifier_2 = pdb_code is not None and uniprot_id is not None
invalid_input_combination_1 = uniprot_id is not None and source == "pdb"
invalid_input_combination_2 = pdb_code is not None and source in {
"alphafold2-v1", "alphafold2-v2", "alphafold2-v3"}
"alphafold2-v1",
"alphafold2-v2",
"alphafold2-v3",
}

if invalid_input_identifier_1 or invalid_input_identifier_2:
raise ValueError(
"Please provide either a PDB code or a UniProt ID.")
raise ValueError("Please provide either a PDB code or a UniProt ID.")

if invalid_input_combination_1:
raise ValueError(
"Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'.")
"Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
)
elif invalid_input_combination_2:
raise ValueError(
f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}.")
f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
)

if source == "pdb":
self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
elif source == "alphafold2-v1":
af2_version = 1
self.mmcif_path, self.mmcif_text = self._fetch_af2(
uniprot_id, af2_version)
self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
elif source == "alphafold2-v2":
af2_version = 2
self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
elif source == "alphafold2-v3":
af2_version = 3
self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
else:
raise ValueError(f"Invalid source: {source}."
" Please use one of 'pdb', 'alphafold2-v1', 'alphafold2-v2' or 'alphafold2-v3.")
raise ValueError(
f"Invalid source: {source}."
" Please use one of 'pdb', 'alphafold2-v1', 'alphafold2-v2' or 'alphafold2-v3."
)

self._df = self._construct_df(text=self.mmcif_text)
return self
Expand All @@ -132,9 +156,9 @@ def _construct_df(self, text: str):
data = data[list(data.keys())[0]]
self.data = data
df: Dict[str, pd.DataFrame] = {}
full_df = pd.DataFrame.from_dict(
data["atom_site"], orient="index").transpose()
full_df = full_df.astype(mmcif_col_types, errors="ignore")
full_df = pd.DataFrame.from_dict(data["atom_site"], orient="index").transpose()
types = {k: v for k, v in mmcif_col_types.items() if k in full_df.columns}
full_df = full_df.astype(types, errors="ignore")
df["ATOM"] = pd.DataFrame(full_df[full_df.group_PDB == "ATOM"])
df["HETATM"] = pd.DataFrame(full_df[full_df.group_PDB == "HETATM"])
try:
Expand All @@ -152,8 +176,7 @@ def _fetch_mmcif(pdb_code):
response = urlopen(url)
txt = response.read()
txt = (
txt.decode(
"utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
txt.decode("utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
)
except HTTPError as e:
print(f"HTTP Error {e.code}")
Expand All @@ -170,14 +193,13 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
try:
response = urlopen(url)
txt = response.read()
if sys.version_info[0] >= 3:
txt = txt.decode('utf-8')
else:
txt = txt.encode('ascii')
txt = (
txt.decode("utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
)
except HTTPError as e:
print('HTTP Error %s' % e.code)
print(f"HTTP Error {e.code}")
except URLError as e:
print('URL Error %s' % e.args)
print(f"URL Error {e.args}")
return url, txt

@staticmethod
Expand All @@ -190,8 +212,7 @@ def _read_mmcif(path):
r_mode = "rb"
openf = gzip.open
else:
allowed_formats = ", ".join(
(".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
allowed_formats = ", ".join((".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
raise ValueError(
f"Wrong file format; allowed file formats are {allowed_formats}"
)
Expand All @@ -201,11 +222,48 @@ def _read_mmcif(path):

if path.endswith(".gz"):
txt = (
txt.decode(
"utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
txt.decode("utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
)
return path, txt

def esmfold(self, sequence: str, out_path: Optional[str] = None, version: int = 1):
"""Fold a protein sequence using the ESMFold model from the ESMFold server at
https://api.esmatlas.com/foldSequence/v1/pdb/.


Parameters
----------
sequence : str
A protein sequence in one-letter code.
out_path : str, optional
Path to save the PDB file to. If `None`, the file is not saved.
Defaults to `None`.

version : int, optional
The version of the ESMFold model to use. Defaults to `1`.


Returns
--------
self
"""
URL = f"https://api.esmatlas.com/foldSequence/v{version}/cif/"
headers: Dict[str, str] = {
"Content-Type": "application/x-www-form-urlencoded",
}

cif = requests.post(URL, data=sequence, headers=headers).text

# append header
header = "\n".join([f"data_{sequence}", "#", f"_entry.id\t{sequence}", "#\n"])
cif = header + cif
if out_path is not None:
with open(out_path, "w") as f:
f.write(cif)

self._df = self._construct_df(text=cif)
return self

def get(self, s, df=None, invert=False, records=("ATOM", "HETATM")):
"""Filter PDB DataFrames by properties

Expand Down Expand Up @@ -278,8 +336,7 @@ def _get_mainchain(
def _get_hydrogen(df, invert):
"""Return only hydrogen atom entries from a DataFrame"""
return (
df[(df["type_symbol"] != "H")] if invert else df[(
df["type_symbol"] == "H")]
df[(df["type_symbol"] != "H")] if invert else df[(df["type_symbol"] == "H")]
)

@staticmethod
Expand Down Expand Up @@ -346,8 +403,7 @@ def amino3to1(
indices.append(ind)
cmp = num

transl = tmp.iloc[indices][residue_col].map(
amino3to1dict).fillna(fillna)
transl = tmp.iloc[indices][residue_col].map(amino3to1dict).fillna(fillna)

return pd.concat((tmp.iloc[indices][chain_col], transl), axis=1)

Expand Down Expand Up @@ -473,7 +529,7 @@ def _init_get_dict():
"heavy": PandasMmcif._get_heavy,
}

def read_mmcif_from_list(self, mmcif_lines):
def read_mmcif_from_list(self, mmcif_lines: List[str]):
"""Reads mmCIF file from a list into DataFrames

Attributes
Expand All @@ -486,13 +542,15 @@ def read_mmcif_from_list(self, mmcif_lines):
self

"""
self.pdb_text = "".join(mmcif_lines)
self._df = self._construct_df(mmcif_lines)
self.mmcif_text = "\n".join(mmcif_lines)
self._df = self._construct_df("\n".join(mmcif_lines))
# self.header, self.code = self._parse_header_code()
self.code = self.data["entry"]["id"][0].lower()
return self

def get_pandas_pdb(self, offset_chains: bool = True, records: List[str] = ["ATOM", "HETATM"]) -> PandasPdb:
def get_pandas_pdb(
self, offset_chains: bool = True, records: List[str] = ["ATOM", "HETATM"]
) -> PandasPdb:
"""Returns a PandasPdb object with the same data as the PandasMmcif
object.

Expand Down Expand Up @@ -532,10 +590,13 @@ def get_pandas_pdb(self, offset_chains: bool = True, records: List[str] = ["ATOM

# Update atom numbers
if offset_chains:
offsets = pandaspdb.df["ATOM"]["chain_id"].astype(
"category").cat.codes
pandaspdb.df["ATOM"]["atom_number"] = pandaspdb.df["ATOM"]["atom_number"] + offsets
offsets = pandaspdb.df["ATOM"]["chain_id"].astype("category").cat.codes
pandaspdb.df["ATOM"]["atom_number"] = (
pandaspdb.df["ATOM"]["atom_number"] + offsets
)
hetatom_offset = offsets.max() + 1
pandaspdb.df["HETATM"]["atom_number"] = pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
pandaspdb.df["HETATM"]["atom_number"] = (
pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
)

return pandaspdb
7 changes: 3 additions & 4 deletions biopandas/mmcif/tests/test_read_mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@
af2_test_struct_v3 = f.read()



def test__read_pdb():
"""Test private _read_pdb"""
ppdb = PandasMmcif()
Expand Down Expand Up @@ -245,7 +244,7 @@ def test_read_pdb():
"""Test public read_pdb"""
ppdb = PandasMmcif()
ppdb.read_mmcif(TESTDATA_FILENAME)
assert ppdb.pdb_text == three_eiy
assert ppdb.mmcif_text == three_eiy
assert ppdb.code == "3eiy", ppdb.code
assert ppdb.mmcif_path == TESTDATA_FILENAME

Expand All @@ -255,8 +254,8 @@ def test_read_pdb_from_list():

for pdb_text, code in zip([three_eiy, four_eiy], ["3eiy", "4eiy"]):
ppdb = PandasMmcif()
ppdb.read_mmcif_from_list(pdb_text)
assert ppdb.pdb_text == pdb_text
ppdb.read_mmcif_from_list(pdb_text.split("\n"))
assert ppdb.mmcif_text == pdb_text
assert ppdb.code == code
assert ppdb.mmcif_path == ""

Expand Down
Loading